author    | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300
commit    | 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree      | 5e910f0e82173f4ef4f51111366a3f1299037a7b /kernel/trace
Initial import
Diffstat (limited to 'kernel/trace')
44 files changed, 44621 insertions, 0 deletions
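For reference, a minimal sketch of how this import could be inspected locally, assuming a clone of a repository that contains the commit shown above (the --stat flag and the path-limiting argument are standard git options, not something taken from this page):

    git show --stat 57f0f512b273f60d52568b8c6b77e17f5636edc0 -- kernel/trace
    git show 57f0f512b273f60d52568b8c6b77e17f5636edc0 -- kernel/trace/Kconfig

The second command limits the output to the Kconfig hunk reproduced at the start of the diff below.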
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 000000000..3b9a48ae1 --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,641 @@ +# +# Architectures that offer an FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: +# + +config USER_STACKTRACE_SUPPORT + bool + +config NOP_TRACER + bool + +config HAVE_FTRACE_NMI_ENTER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_TRACER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_GRAPH_TRACER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_GRAPH_FP_TEST + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_DYNAMIC_FTRACE + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + +config HAVE_FTRACE_MCOUNT_RECORD + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_SYSCALL_TRACEPOINTS + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + +config TRACER_MAX_TRACE + bool + +config TRACE_CLOCK + bool + +config RING_BUFFER + bool + select TRACE_CLOCK + select IRQ_WORK + +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + +config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + bool + +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hiding of the automatic options. + +config TRACING + bool + select DEBUG_FS + select RING_BUFFER + select STACKTRACE if STACKTRACE_SUPPORT + select TRACEPOINTS + select NOP_TRACER + select BINARY_PRINTF + select EVENT_TRACING + select TRACE_CLOCK + +config GENERIC_TRACER + bool + select TRACING + +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. + depends on TRACE_IRQFLAGS_SUPPORT || PPC32 + depends on STACKTRACE_SUPPORT + default y + +if TRACING_SUPPORT + +menuconfig FTRACE + bool "Tracers" + default y if DEBUG_KERNEL + help + Enable the kernel tracing infrastructure. + +if FTRACE + +config FUNCTION_TRACER + bool "Kernel Function Tracer" + depends on HAVE_FUNCTION_TRACER + select KALLSYMS + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction at the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. 
If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. + +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER + depends on FUNCTION_TRACER + depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE + default y + help + Enable the kernel to trace a function at both its return + and its entry. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. + + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on !ARCH_USES_GETTIMEOFFSET + select TRACE_IRQFLAGS + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on !ARCH_USES_GETTIMEOFFSET + depends on PREEMPT + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP + help + This option measures the time spent in preemption-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + select TRACER_SNAPSHOT + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" + depends on !GENERIC_TRACER + select TRACING + help + This tracer hooks to various trace points in the kernel, + allowing the user to pick and choose which trace point they + want to trace. It also includes the sched_switch tracer plugin. + +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_SYSCALL_TRACEPOINTS + select GENERIC_TRACER + select KALLSYMS + help + Basic tracer to catch the syscall entry and exit events. + +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). 
If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + +config TRACE_BRANCH_PROFILING + bool + select GENERIC_TRACER + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if-statement in the + kernel. This profiler will also enable the likely/unlikely + profiler. + + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING + help + This tracer profiles all likely and unlikely macros + in the kernel. It will display the results in: + + /sys/kernel/debug/tracing/trace_stat/branch_annotated + + Note: this will add a significant overhead; only turn this + on if you need to profile the system's use of these macros. + +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + select TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /sys/kernel/debug/tracing/trace_stat/branch_all + + This option also enables the likely/unlikely profiler. + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed in much detail. +endchoice + +config TRACING_BRANCHES + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config BRANCH_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS + help + This special tracer records the maximum stack footprint of the + kernel and displays it in /sys/kernel/debug/tracing/stack_trace. 
+ + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled + + Say N if unsure. + +config BLK_DEV_IO_TRACE + bool "Support for tracing block IO actions" + depends on SYSFS + depends on BLOCK + select RELAY + select DEBUG_FS + select TRACEPOINTS + select GENERIC_TRACER + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + + If unsure, say N. + +config KPROBE_EVENT + depends on KPROBES + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable kprobes-based dynamic events" + select TRACING + select PROBE_EVENTS + default y + help + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. + +config UPROBE_EVENT + bool "Enable uprobes-based dynamic events" + depends on ARCH_SUPPORTS_UPROBES + depends on MMU + depends on PERF_EVENTS + select UPROBES + select PROBE_EVENTS + select TRACING + default n + help + This allows the user to add tracing events on top of userspace + dynamic events (similar to tracepoints) on the fly via the trace + events interface. Those events can be inserted wherever uprobes + can probe, and record various registers. + This option is required if you plan to use perf-probe subcommand + of perf tools on user space applications. + +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + +config PROBE_EVENTS + def_bool n + +config DYNAMIC_FTRACE + bool "enable/disable function tracing dynamically" + depends on FUNCTION_TRACER + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace + + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. 
+ +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. + + If in doubt, say N. + +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on GENERIC_TRACER + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && PCI + select GENERIC_TRACER + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/trace/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_sched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. 
+ + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N. + +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statics and more checks are done. + It will output the stats of each per cpu buffer. What + was written, the sizes, what was read, what was lost, and + other similar details. + + If unsure, say N + +config TRACE_ENUM_MAP_FILE + bool "Show enum mappings for trace events" + depends on TRACING + help + The "print fmt" of the trace events will show the enum names instead + of their values. This can cause problems for user space tools that + use this string to parse the raw data as user space does not know + how to convert the string to its value. + + To fix this, there's a special macro in the kernel that can be used + to convert the enum into its value. If this macro is used, then the + print fmt strings will have the enums converted to their values. + + If something does not get converted properly, this option can be + used to show what enums the kernel tried to convert. + + This option is for debugging the enum conversions. A file is created + in the tracing directory called "enum_map" that will show the enum + names matched with their values and what trace event system they + belong too. + + Normally, the mapping of the strings to values will be freed after + boot up or module load. With this option, they will not be freed, as + they are needed for the "enum_map" file. Enabling this option will + increase the memory footprint of the running kernel. 
+ + If unsure, say N + +endif # FTRACE + +endif # TRACING_SUPPORT + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 000000000..9b1044e93 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,70 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FUNCTION_TRACER +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) + +ifdef CONFIG_FTRACE_SELFTEST +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) +obj-y += trace_selftest_dynamic.o +endif +endif + +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +endif + +CFLAGS_trace_benchmark.o := -I$(src) +CFLAGS_trace_events_filter.o := -I$(src) + +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o + +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o +obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif +obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o + +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 000000000..483cecfa5 --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1819 @@ +/* + * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/blktrace_api.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/time.h> +#include <linux/uaccess.h> +#include <linux/list.h> + +#include <trace/events/block.h> + +#include "trace_output.h" + +#ifdef CONFIG_BLK_DEV_IO_TRACE + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static bool blk_tracer_enabled __read_mostly; + +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static void blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. + */ +static void trace_note(struct blk_trace *bt, pid_t pid, int action, + const void *data, size_t len) +{ + struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; + int pc = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + + if (blk_tracer) { + buffer = blk_tr->trace_buffer.buffer; + pc = preempt_count(); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + if (!bt->rchan) + return; + + t = relay_reserve(bt->rchan, sizeof(*t) + len); + if (t) { + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->time = ktime_to_ns(ktime_get()); +record_it: + t->device = bt->dev; + t->action = action; + t->pid = pid; + t->cpu = cpu; + t->pdu_len = len; + memcpy((void *) t + sizeof(*t), data, len); + + if (blk_tracer) + trace_buffer_unlock_commit(buffer, event, 0, pc); + } +} + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct task_struct *tsk) +{ + unsigned long flags; + struct blk_trace *bt; + + tsk->btrace_seq = blktrace_seq; + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); +} + +static void trace_note_time(struct blk_trace *bt) +{ + struct timespec now; + unsigned long flags; + u32 words[2]; + + getnstimeofday(&now); + words[0] = now.tv_sec; + words[1] = now.tv_nsec; + + local_irq_save(flags); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + local_irq_restore(flags); +} + +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
+{ + int n; + va_list args; + unsigned long flags; + char *buf; + + if (unlikely(bt->trace_state != Blktrace_running && + !blk_tracer_enabled)) + return; + + /* + * If the BLK_TC_NOTIFY action mask isn't set, don't send any note + * message to the trace. + */ + if (!(bt->act_mask & BLK_TC_NOTIFY)) + return; + + local_irq_save(flags); + buf = this_cpu_ptr(bt->msg_data); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +#define BLK_TC_RAHEAD BLK_TC_AHEAD + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; + pid_t pid; + int cpu, pc = 0; + bool blk_tracer = blk_tracer_enabled; + + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) + return; + + what |= ddir_act[rw & WRITE]; + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); + what |= MASK_TC_BIT(rw, META); + what |= MASK_TC_BIT(rw, DISCARD); + what |= MASK_TC_BIT(rw, FLUSH); + what |= MASK_TC_BIT(rw, FUA); + + pid = tsk->pid; + if (act_log_check(bt, what, sector, pid)) + return; + cpu = raw_smp_processor_id(); + + if (blk_tracer) { + tracing_record_cmdline(current); + + buffer = blk_tr->trace_buffer.buffer; + pc = preempt_count(); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. + */ + local_irq_save(flags); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. 
+ */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = bt->dev; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tracer) { + trace_buffer_unlock_commit(buffer, event, 0, pc); + return; + } + } + + local_irq_restore(flags); +} + +static struct dentry *blk_tree_root; +static DEFINE_MUTEX(blk_tree_mutex); + +static void blk_trace_free(struct blk_trace *bt) +{ + debugfs_remove(bt->msg_file); + debugfs_remove(bt->dropped_file); + relay_close(bt->rchan); + debugfs_remove(bt->dir); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + kfree(bt); +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + blk_trace_free(bt); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); +} + +int blk_trace_remove(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state != Blktrace_running) + blk_trace_cleanup(bt); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_remove); + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static const struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = blk_dropped_read, + .llseek = default_llseek, +}; + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count >= BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count + 1, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + msg[count] = '\0'; + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = blk_msg_write, + .llseek = noop_llseek, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. 
+ */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + umode_t mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + +/* + * Setup everything required to start tracing + */ +int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) +{ + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + int ret, i; + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts->name); i++) + if (buts->name[i] == '/') + buts->name[i] = '_'; + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + ret = -ENOMEM; + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto err; + + ret = -ENOENT; + + mutex_lock(&blk_tree_mutex); + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) { + mutex_unlock(&blk_tree_mutex); + goto err; + } + } + mutex_unlock(&blk_tree_mutex); + + dir = debugfs_create_dir(buts->name, blk_tree_root); + + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = dev; + atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + blk_trace_setup_lba(bt, bdev); + + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + + return 0; +err: + blk_trace_free(bt); + return ret; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t 
dev, + struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + struct blk_trace *bt = q->blk_trace; + + if (bt == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); + + trace_note_time(bt); + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_startstop); + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + struct request_queue *q; + int ret, start = 0; + char b[BDEVNAME_SIZE]; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + bdevname(bdev, b); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + bdevname(bdev, b); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(struct request_queue *q) +{ + if (q->blk_trace) { + blk_trace_startstop(q, 0); + blk_trace_remove(q); + } +} + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source 
request + * @nr_bytes: number of completed bytes + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + unsigned int nr_bytes, u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, + what, rq->errors, rq->cmd_len, rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, + rq->cmd_flags, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, + struct request *rq, + unsigned int nr_bytes) +{ + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * @error: error, if any + * + * Description: + * Records an action against a bio. Will log the bio offset + size. 
+ * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what, int error) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, what, error, 0, NULL); +} + +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); +} + +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); +} + +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, + struct request *rq, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); +} + +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, + struct request *rq, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); +} + +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); +} + +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); + } +} + +static void blk_add_trace_plug(void *ignore, struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(depth); + u32 what; + + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, + !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation + * @ignore: trace callback data parameter (not used) + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. 
+ * + **/ +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); + else + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static void blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); + WARN_ON(ret); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + 
WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + WARN_ON(ret); +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + int tc = t->action >> BLK_TC_SHIFT; + + if (t->action == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } + + if (tc & BLK_TC_FLUSH) + rwbs[i++] = 'F'; + + if (tc & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (tc & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (tc & BLK_TC_FUA) + rwbs[i++] = 'F'; + if (tc & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (tc & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (tc & BLK_TC_META) + rwbs[i++] = 'M'; +out: + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->error; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector_from = __r->sector_from; + + r->device_from = be32_to_cpu(__r->device_from); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); +} + +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); + +static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +{ + char rwbs[RWBS_LEN]; + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); + unsigned secs = 
(unsigned long)ts; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + + trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); +} + +static void blk_log_action(struct trace_iterator *iter, const char *act) +{ + char rwbs[RWBS_LEN]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const unsigned char *pdu_buf; + int pdu_len; + int i, end; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + trace_seq_putc(s, '('); + + for (i = 0; i < pdu_len; i++) { + + trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) { + trace_seq_puts(s, " ..) "); + return; + } + } + + trace_seq_puts(s, ") "); +} + +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + trace_seq_printf(s, "%u ", t_bytes(ent)); + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + else + trace_seq_printf(s, "[%s]\n", cmd); + } +} + +static void blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%d]\n", t_error(ent)); + } else { + if (t_sec(ent)) + trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + else + trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } +} + +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device_from = 0, }; + + get_pdu_remap(ent, &r); + trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); +} + +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "[%s]\n", cmd); +} + +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); +} + +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), cmd); +} + +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +{ + const struct blk_io_trace *t = te_blk_io_trace(ent); + + trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putc(s, '\n'); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, 
"# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + blk_tracer_enabled = true; +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + blk_tracer_enabled = false; +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + blk_tracer_stop(tr); +} + +static const struct { + const char *act[2]; + void (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t; + u16 what; + bool long_act; + blk_log_action_t *log_action; + + t = te_blk_io_trace(iter->ent); + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; + + if (t->action == BLK_TN_MESSAGE) { + log_action(iter, long_act ? 
"message" : "m"); + blk_log_msg(s, iter->ent); + } + + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) + trace_seq_printf(s, "Unknown action %x\n", what); + else { + log_action(iter, what2act[what].act[long_act]); + what2act[what].print(s, iter->ent); + } + + return trace_handle_return(s); +} + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return print_one_line(iter, false); +} + +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = iter->ts, + }; + + trace_seq_putmem(s, &old, offset); + trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + blk_trace_synthesize_old_trace(iter); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + return print_one_line(iter, true); +} + +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, +}; + +static struct trace_event_functions trace_blk_event_funcs = { + .trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + + blk_trace_free(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret = -ENOMEM; + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; + + bt->dev = bdev->bd_dev; + bt->act_mask = (u16)-1; + + blk_trace_setup_lba(bt, bdev); + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + ret = -EBUSY; + goto free_bt; + } + + if 
(atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; + +free_bt: + blk_trace_free(bt); + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static BLK_TRACE_DEVICE_ATTR(enable); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_FLUSH, "flush" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, + { BLK_TC_FUA, "fua" }, +}; + +static int blk_trace_str2mask(const char *str) +{ + int i; + int mask = 0; + char *buf, *s, *token; + + buf = kstrdup(str, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + s = strstrip(buf); + + while (1) { + token = strsep(&s, ","); + if (token == NULL) + break; + + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } + } + kfree(buf); + + return mask; +} + +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? 
"" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out; + + q = blk_trace_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -EINVAL; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + ret = blk_trace_str2mask(buf); + if (ret < 0) + goto out; + value = ret; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + ret = -ENXIO; + + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out; + + q = blk_trace_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + } + +out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out: + return ret ? ret : count; +} + +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? 
"" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & REQ_FLUSH) + rwbs[i++] = 'F'; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & REQ_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & REQ_FUA) + rwbs[i++] = 'F'; + if (rw & REQ_RAHEAD) + rwbs[i++] = 'A'; + if (rw & REQ_SYNC) + rwbs[i++] = 'S'; + if (rw & REQ_META) + rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; + + rwbs[i] = '\0'; +} +EXPORT_SYMBOL_GPL(blk_fill_rwbs); + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000..2d56ce501 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,222 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * 
bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 000000000..02bece4a9 --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,5951 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. 
+ * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com> + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ + +#include <linux/stop_machine.h> +#include <linux/clocksource.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/suspend.h> +#include <linux/tracefs.h> +#include <linux/hardirq.h> +#include <linux/kthread.h> +#include <linux/uaccess.h> +#include <linux/bsearch.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/sort.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/rcupdate.h> + +#include <trace/events/sched.h> + +#include <asm/setup.h> + +#include "trace_output.h" +#include "trace_stat.h" + +#define FTRACE_WARN_ON(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +#define FTRACE_WARN_ON_ONCE(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +/* hash bits for specific function selection */ +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 + +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) + +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#else +#define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) +#endif + +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) +}; + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int last_ftrace_enabled; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; + +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. 
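 *
 * (Clarifying note, based on how the flag is used below: ftrace_enabled
 * merely gates whether the callbacks are invoked, while ftrace_disabled
 * makes the register/update paths bail out entirely; once ftrace_kill()
 * sets it, nothing clears it again.)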
+ */ +static int ftrace_disabled __read_mostly; + +static DEFINE_MUTEX(ftrace_lock); + +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; + +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); + +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif + +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw_notrace() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_notrace(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; + ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif +} + +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; +} + +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + if (!test_tsk_trace_trace(current)) + return; + + ftrace_pid_function(ip, parent_ip, op, regs); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! */ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. 
There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; +} + +static void control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + /* + * Prepare the ftrace_ops that the arch callback will use. + * If there's only one ftrace_ops registered, the ftrace_ops_list + * will point to the ops we want. + */ + set_function_trace_op = ftrace_ops_list; + + /* If there's no ftrace_ops registered, just call the stub function */ + if (ftrace_ops_list == &ftrace_list_end) { + func = ftrace_stub; + + /* + * If we are at the end of the list and this ops is + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. + */ + } else if (ftrace_ops_list->next == &ftrace_list_end) { + func = ftrace_ops_get_list_func(ftrace_ops_list); + + } else { + /* Just use the default ftrace_ops */ + set_function_trace_op = &ftrace_list_end; + func = ftrace_ops_list_func; + } + + update_function_graph_func(); + + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coorditate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + schedule_on_each_cpu(ftrace_sync); + /* Now all cpus are using the list ops. 
*/ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + + ftrace_trace_function = func; +} + +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; +} + +static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +{ + ops->next = *list; + /* + * We are entering ops into the list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the list. + */ + rcu_assign_pointer(*list, ops); +} + +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (*list == ops && ops->next == &ftrace_list_end) { + *list = &ftrace_list_end; + return 0; + } + + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) + return -1; + + *p = (*p)->next; + return 0; +} + +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops); + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_DELETED) + return -EINVAL; + + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 
+ */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + + if (!core_kernel_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + + if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); + /* The control_ops needs the trampoline update */ + ops = &control_ops; + } else + add_ftrace_ops(&ftrace_ops_list, ops); + + ftrace_update_trampoline(ops); + + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} + +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; + + if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + + if (ret < 0) + return ret; + + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} + +static void ftrace_update_pid_func(void) +{ + /* Only do something if we are tracing something */ + if (ftrace_trace_function == ftrace_stub) + return; + + update_ftrace_function(); +} + +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; + unsigned long long time_squared; +#endif +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + if (idx != 0) + rec++; + + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = 
p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_puts(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); +#else + seq_puts(m, " Function Hit\n" + " -------- ---\n"); +#endif + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; + int ret = 0; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + unsigned long long avg; + unsigned long long stddev; +#endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_puts(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + + /* + * Divide only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. + */ + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); + } + + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); + trace_print_seq(m, &s); +#endif + seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); + + return ret; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + + pg = stat->pages = stat->start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(stat->hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + int functions; + int pages; + int i; + + /* If we already allocated, do nothing */ + if (stat->pages) + return 0; + + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) + return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. 
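 *
 * (Rough sizing sketch, assuming 4 KiB pages: PROFILES_PER_PAGE works out
 * to on the order of a hundred records per page, depending on whether the
 * graph-tracer fields are built in, so 20000 functions costs a couple
 * hundred pages, i.e. somewhere around 1 MiB of profile storage per CPU.)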
+ */ + functions = 20000; +#endif + + pg = stat->start = stat->pages; + + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 1; i < pages; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!pg->next) + goto out_free; + pg = pg->next; + } + + return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ + struct ftrace_profile_stat *stat; + int size; + + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(stat); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!stat->hash) + return -ENOMEM; + + /* Preallocate the function profiling pages */ + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; + return -ENOMEM; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_possible_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + unsigned long key; + + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); + hhd = &stat->hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. 
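 * The atomic stat->disabled counter in this function is what makes it
 * safe against re-entry from NMIs: a nested call simply returns NULL and
 * the caller drops that sample.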
+ */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion (from NMIs) */ + if (atomic_inc_return(&stat->disabled) != 1) + goto out; + + /* + * Try to find the function again since an NMI + * could have added it + */ + rec = ftrace_find_profiled_func(stat, ip); + if (rec) + goto out; + + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; + } + + rec = &stat->pages->records[stat->pages->index++]; + rec->ip = ip; + ftrace_add_profile(stat, rec); + + out: + atomic_dec(&stat->disabled); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct ftrace_profile_stat *stat; + struct ftrace_profile *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + + stat = this_cpu_ptr(&ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); + if (!rec) { + rec = ftrace_profile_alloc(stat, ip); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0, NULL, NULL); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + struct ftrace_profile_stat *stat; + unsigned long long calltime; + struct ftrace_profile *rec; + unsigned long flags; + + local_irq_save(flags); + stat = this_cpu_ptr(&ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + + rec = ftrace_find_profiled_func(stat, trace->func); + if (rec) { + rec->time += calltime; + rec->time_squared += calltime * calltime; + } + + out: + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = { + .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_OPS_HASH(ftrace_profile_ops) +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + ret = 
register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ + unregister_ftrace_profiler(); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + *ppos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; /* big enough to hold a number */ + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, + .llseek = default_llseek, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) +{ + struct ftrace_profile_stat *stat; + struct dentry *entry; + char *name; + int ret; + int cpu; + + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } + } + + entry = tracefs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create tracefs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_graph_active; +#else +# define ftrace_graph_active 0 +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct ftrace_ops *removed_ops; + +/* + * Set when doing a global update, like enabling all recs or disabling them. + * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_probe { + struct hlist_node node; + struct ftrace_probe_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct list_head free_list; +}; + +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. 
These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. + */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, +}; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) + +static struct ftrace_ops global_ops = { + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, +}; + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + struct ftrace_ops *op; + bool ret = false; + + /* + * Some of the ops may be dynamically allocated, + * they are freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This is to check for dynamically allocated trampolines. + * Trampolines that are in kernel text will have + * core_kernel_text() return true. + */ + if (op->trampoline && op->trampoline_size) + if (addr >= op->trampoline && + addr < op->trampoline + op->trampoline_size) { + ret = true; + goto out; + } + } while_for_each_ftrace_op(op); + + out: + preempt_enable_notrace(); + + return ret; +} + +struct ftrace_page { + struct ftrace_page *next; + struct dyn_ftrace *records; + int index; + int size; +}; + +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + + if (ftrace_hash_empty(hash)) + return NULL; + + if (hash->size_bits > 0) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + struct hlist_head *hhd; + unsigned long key; + + if (hash->size_bits) + key = hash_long(entry->ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ip = ip; + __add_hash_entry(hash, entry); + + return 0; +} + +static void +free_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + if (!hash->count) + return; + 
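	/*
	 * Walk every bucket and free each entry below; free_hash_entry()
	 * also decrements hash->count, so the count should be back to zero
	 * by the time the FTRACE_WARN_ON() check runs.
	 */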
+ for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) + free_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + +void ftrace_free_filter(struct ftrace_ops *ops) +{ + ftrace_ops_init(ops); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + int size; + int ret; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + /* Empty hash? */ + if (ftrace_hash_empty(hash)) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + ret = add_hash_entry(new_hash, entry->ip); + if (ret < 0) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static void +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash); + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tn; + struct hlist_head *hhd; + struct ftrace_hash *new_hash; + int size = src->count; + int bits = 0; + int ret; + int i; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + /* + * If the new source is empty, just free dst and assign it + * the empty_hash. 
+ */ + if (!src->count) { + new_hash = EMPTY_HASH; + goto update; + } + + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + return -ENOMEM; + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) { + remove_hash_entry(src, entry); + __add_hash_entry(new_hash, entry); + } + } + +update: + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ + if (enable) { + /* IPMODIFY should be updated only when filter_hash updating */ + ret = ftrace_hash_ipmodify_update(ops, new_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return ret; + } + } + + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable_modify(ops, enable); + + rcu_assign_pointer(*dst, new_hash); + + ftrace_hash_rec_enable_modify(ops, enable); + + return 0; +} + +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an emty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. + */ + return (ftrace_hash_empty(hash->filter_hash) || + ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !ftrace_lookup_ip(hash->notrace_hash, ip)); +} + +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + struct ftrace_ops_hash hash; + int ret; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + + hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + + if (hash_contains_ip(ip, &hash)) + ret = 1; + else + ret = 0; + + return ret; +} + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. 
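 *
 * A minimal usage sketch (hypothetical target_ip, not a routine taken
 * from this file):
 *
 *	do_for_each_ftrace_rec(pg, rec) {
 *		if (rec->ip == target_ip)
 *			goto found;
 *	} while_for_each_ftrace_rec();
 *	return NULL;
 * found:
 *	return rec;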
+ */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; + + if (key->flags < rec->ip) + return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; + return 0; +} + +static unsigned long ftrace_location_range(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) + return rec->ip; + } + + return 0; +} + +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip); +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(const void *start, const void *end) +{ + unsigned long ret; + + ret = ftrace_location_range((unsigned long)start, + (unsigned long)end); + + return (int)!!ret; +} + +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + bool keep_regs = false; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(ops, rec->ip, rec)) { + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + keep_regs = true; + break; + } + } + } + + return keep_regs; +} + +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int count = 0; + int all = 0; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. 
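 *
 * Put roughly, for an enabled ops this means:
 *	filter_hash,  inc	-> bump the ref count of the listed functions
 *	filter_hash,  !inc	-> drop it again
 *	notrace_hash, inc	-> drop the ref count of the listed functions
 *	notrace_hash, !inc	-> restore it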
+ */ + if (filter_hash) { + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; + if (ftrace_hash_empty(hash)) + all = 1; + } else { + inc = !inc; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (ftrace_hash_empty(hash)) + return; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. + */ + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. + * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || ftrace_hash_empty(other_hash))) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) + return; + + /* + * If there's only a single callback registered to a + * function, and the ops has a trampoline registered + * for it, then we can call it directly. + */ + if (ftrace_rec_count(rec) == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + /* + * If we are adding another function callback + * to this function, and the previous had a + * custom trampoline in use, then we need to go + * back to the default trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + } else { + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) + return; + rec->flags--; + + /* + * If the rec had REGS enabled and the ops that is + * being removed had REGS set, then see if there is + * still any ops for this record that wants regs. + * If not, we can stop recording them. + */ + if (ftrace_rec_count(rec) > 0 && + rec->flags & FTRACE_FL_REGS && + ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + if (!test_rec_ops_needs_regs(rec)) + rec->flags &= ~FTRACE_FL_REGS; + } + + /* + * If the rec had TRAMP enabled, then it needs to + * be cleared. As TRAMP can only be enabled iff + * there is only a single ops attached to it. + * In otherwords, always disable it on decrementing. + * In the future, we may set it if rec count is + * decremented to one, and the ops that is left + * has a trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* + * flags will be cleared in ftrace_check_record() + * if rec count is zero. + */ + } + count++; + /* Shortcut, if we handled all records, we are done. 
*/ + if (!all && count == hash->count) + return; + } while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 1); +} + +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + +/* + * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK + * or no-needed to update, -EBUSY if it detects a conflict of the flag + * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. + * Note that old_hash and new_hash has below meanings + * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) + * - If the hash is EMPTY_HASH, it hits nothing + * - Anything else hits the recs which match the hash entries. + */ +static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, + struct ftrace_hash *old_hash, + struct ftrace_hash *new_hash) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec, *end = NULL; + int in_old, in_new; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + /* + * Since the IPMODIFY is a very address sensitive action, we do not + * allow ftrace_ops to set all functions to new hash. 
+ */ + if (!new_hash || !old_hash) + return -EINVAL; + + /* Update rec->flags */ + do_for_each_ftrace_rec(pg, rec) { + /* We need to update only differences of filter_hash */ + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) { + /* New entries must ensure no others are using it */ + if (rec->flags & FTRACE_FL_IPMODIFY) + goto rollback; + rec->flags |= FTRACE_FL_IPMODIFY; + } else /* Removed entry */ + rec->flags &= ~FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + + return 0; + +rollback: + end = rec; + + /* Roll back what we did above */ + do_for_each_ftrace_rec(pg, rec) { + if (rec == end) + goto err_out; + + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) + rec->flags &= ~FTRACE_FL_IPMODIFY; + else + rec->flags |= FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + +err_out: + return -EBUSY; +} + +static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); +} + +/* Disabling always succeeds */ +static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); +} + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash) +{ + struct ftrace_hash *old_hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(old_hash)) + old_hash = NULL; + + if (ftrace_hash_empty(new_hash)) + new_hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); +} + +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); + +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @rec: The record that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, struct dyn_ftrace *rec) +{ + unsigned long ip = rec ? rec->ip : 0; + + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" actual: ", (unsigned char *)ip); + pr_cont("\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } + if (rec) { + struct ftrace_ops *ops = NULL; + + pr_info("ftrace record flags: %lx\n", rec->flags); + pr_cont(" (%ld)%s", ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? 
" R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + pr_cont("\ttramp: %pS", + (void *)ops->trampoline); + else + pr_cont("\ttramp: ERROR!"); + + } + ip = ftrace_get_addr_curr(rec); + pr_cont(" expected tramp: %lx\n", ip); + } +} + +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +{ + unsigned long flag = 0UL; + + /* + * If we are updating calls: + * + * If the record has a ref count, then we need to enable it + * because someone is using it. + * + * Otherwise we make sure its disabled. + * + * If we are disabling calls, then disable all records that + * are enabled. + */ + if (enable && ftrace_rec_count(rec)) + flag = FTRACE_FL_ENABLED; + + /* + * If enabling and the REGS flag does not match the REGS_EN, or + * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore + * this record. Set flags to fail the compare against ENABLED. + */ + if (flag) { + if (!(rec->flags & FTRACE_FL_REGS) != + !(rec->flags & FTRACE_FL_REGS_EN)) + flag |= FTRACE_FL_REGS; + + if (!(rec->flags & FTRACE_FL_TRAMP) != + !(rec->flags & FTRACE_FL_TRAMP_EN)) + flag |= FTRACE_FL_TRAMP; + } + + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return FTRACE_UPDATE_IGNORE; + + if (flag) { + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { + rec->flags |= FTRACE_FL_ENABLED; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + if (flag & FTRACE_FL_TRAMP) { + if (rec->flags & FTRACE_FL_TRAMP) + rec->flags |= FTRACE_FL_TRAMP_EN; + else + rec->flags &= ~FTRACE_FL_TRAMP_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function or + * vice versa, or from a trampoline call. + */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; + } + + if (update) { + /* If there's no more users, clear all flags */ + if (!ftrace_rec_count(rec)) + rec->flags = 0; + else + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. + */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN); + } + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. 
+ */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) + return removed_ops; + } + + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. + */ + if (op->flags & FTRACE_OPS_FL_ADDING) + continue; + + + /* + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. + */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) + return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; + + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* pass rec in as regs to have non-NULL val */ + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. 
+ * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP) { + ops = ftrace_find_tramp_ops_new(rec); + if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_curr(rec); + if (FTRACE_WARN_ON(!ops)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_old_addr; + unsigned long ftrace_addr; + int ret; + + ftrace_addr = ftrace_get_addr_new(rec); + + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_old_addr); + + case FTRACE_UPDATE_MODIFY_CALL: + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); + } + + return -1; /* unknow ftrace bug */ +} + +void __weak ftrace_replace_code(int enable) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int failed; + + if (unlikely(ftrace_disabled)) + return; + + do_for_each_ftrace_rec(pg, rec) { + failed = __ftrace_replace_code(rec, enable); + if (failed) { + ftrace_bug(failed, rec); + /* Stop processing */ + return; + } + } while_for_each_ftrace_rec(); +} + +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. 
+ */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + +static int +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return 0; + + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + if (ret) { + ftrace_bug(ret, rec); + return 0; + } + return 1; +} + +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + +void ftrace_modify_all_code(int command) +{ + int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; + + /* + * If the ftrace_caller calls a ftrace_ops func directly, + * we need to make sure that it only traces functions it + * expects to trace. When doing the switch of functions, + * we need to update to the ftrace_ops_list_func first + * before the transition between old and new calls are set, + * as the ftrace_ops_list_func will check the ops hashes + * to make sure the ops are having the right functions + * traced. + */ + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } + + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; + } + + if (command & FTRACE_START_FUNC_RET) + err = ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + ftrace_modify_all_code(*command); + + return 0; +} + +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. 
+ */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + +static void ftrace_run_update_code(int command) +{ + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); +} + +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_ops_hash *old_hash) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash->filter_hash; + ops->old_hash.notrace_hash = old_hash->notrace_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; +} + +static ftrace_func_t saved_ftrace_func; +static int ftrace_start_up; + +void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ +} + +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + +static void ftrace_startup_enable(int command) +{ + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} + +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + +static int ftrace_startup(struct ftrace_ops *ops, int command) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ret = __register_ftrace_function(ops); + if (ret) + return ret; + + ftrace_start_up++; + command |= FTRACE_UPDATE_CALLS; + + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. + */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; + + ret = ftrace_hash_ipmodify_enable(ops); + if (ret < 0) { + /* Rollback registration process */ + __unregister_ftrace_function(ops); + ftrace_start_up--; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + return ret; + } + + ftrace_hash_rec_enable(ops, 1); + + ftrace_startup_enable(command); + + ops->flags &= ~FTRACE_OPS_FL_ADDING; + + return 0; +} + +static int ftrace_shutdown(struct ftrace_ops *ops, int command) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; + + ftrace_start_up--; + /* + * Just warn in case of unbalance, no need to kill ftrace, it's not + * critical but the ftrace_call callers may be never nopped again after + * further ftrace uses. 
+ */ + WARN_ON_ONCE(ftrace_start_up < 0); + + /* Disabling ipmodify never fails */ + ftrace_hash_ipmodify_disable(ops); + ftrace_hash_rec_disable(ops, 1); + + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + + command |= FTRACE_UPDATE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + return 0; + } + + /* + * If the ops uses a trampoline, then it needs to be + * tested first on update. + */ + ops->flags |= FTRACE_OPS_FL_REMOVING; + removed_ops = ops; + + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + + ftrace_run_update_code(command); + + /* + * If there's no more ops registered with ftrace, run a + * sanity check to make sure all rec flags are cleared. + */ + if (ftrace_ops_list == &ftrace_list_end) { + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (FTRACE_WARN_ON_ONCE(rec->flags)) + pr_warn(" %pS flags:%lx\n", + (void *)rec->ip, rec->flags); + } while_for_each_ftrace_rec(); + } + + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + + removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + arch_ftrace_trampoline_free(ops); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + + return 0; +} + +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} + +static cycle_t ftrace_update_time; +unsigned long ftrace_update_tot_cnt; + +static inline int ops_traces_mod(struct ftrace_ops *ops) +{ + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. 
+ */ + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + /* If ops traces all mods, we already accounted for it */ + if (ops_traces_mod(ops)) + return 0; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + return 0; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + return 0; + + return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) +{ + struct ftrace_page *pg; + struct dyn_ftrace *p; + cycle_t start, stop; + unsigned long update_cnt = 0; + unsigned long ref = 0; + bool test = false; + int i; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. + * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. + */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + if (ops_traces_mod(ops)) + ref++; + else + test = true; + } + } + } + + start = ftrace_now(raw_smp_processor_id()); + + for (pg = new_pgs; pg; pg = pg->next) { + + for (i = 0; i < pg->index; i++) { + int cnt = ref; + + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; + + p = &pg->records[i]; + if (test) + cnt += referenced_filters(p); + p->flags = cnt; + + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; + + update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p); + } + } + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += update_cnt; + + return 0; +} + +static int ftrace_allocate_records(struct ftrace_page *pg, int count) +{ + int order; + int cnt; + + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); + + /* + * We want to fill as much as possible. No more than a page + * may be empty. 
+ */ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; + + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } + + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; + + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) + break; + + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + + pg = pg->next; + } + + return start_pg; + + free_pages: + pg = start_pg; + while (pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + struct ftrace_hash *hash; + struct ftrace_ops *ops; + int hidx; + int idx; + unsigned flags; +}; + +static void * +t_hash_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct hlist_node *hnd = NULL; + struct hlist_head *hhd; + + (*pos)++; + iter->pos = *pos; + + if (iter->probe) + hnd = &iter->probe->node; + retry: + if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + return NULL; + + hhd = &ftrace_func_hash[iter->hidx]; + + if (hlist_empty(hhd)) { + iter->hidx++; + hnd = NULL; + goto retry; + } + + if (!hnd) + hnd = hhd->first; + else { + hnd = hnd->next; + if (!hnd) { + iter->hidx++; + goto retry; + } + } + + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; +} + +static void *t_hash_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + + if (iter->func_pos > *pos) + return NULL; + + iter->hidx = 0; + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); + if (!p) + break; + } + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; +} + +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_func_probe *rec; + + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; + + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); + + if (rec->data) + seq_printf(m, ":%p", rec->data); + seq_putc(m, '\n'); + + return 0; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = iter->ops; + 
struct dyn_ftrace *rec = NULL; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_next(m, pos); + + (*pos)++; + iter->pos = iter->func_pos = *pos; + + if (iter->flags & FTRACE_ITER_PRINTALL) + return t_hash_start(m, pos); + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if (((iter->flags & FTRACE_ITER_FILTER) && + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || + + ((iter->flags & FTRACE_ITER_NOTRACE) && + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & FTRACE_FL_ENABLED))) { + + rec = NULL; + goto retry; + } + } + + if (!rec) + return t_hash_start(m, pos); + + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = iter->ops; + void *p = NULL; + loff_t l; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + + /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* + * For set_ftrace_filter reading, if we have the filter + * off, we can short cut and just print out that all + * functions are enabled. + */ + if ((iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->func_hash->filter_hash)) || + (iter->flags & FTRACE_ITER_NOTRACE && + ftrace_hash_empty(ops->func_hash->notrace_hash))) { + if (*pos > 0) + return t_hash_start(m, pos); + iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; + return iter; + } + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_start(m, pos); + + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; + } + + if (!p) + return t_hash_start(m, pos); + + return iter; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +void * __weak +arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return NULL; +} + +static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + void *ptr; + + ptr = arch_ftrace_trampoline_func(ops, rec); + if (ptr) + seq_printf(m, " ->%pS", ptr); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec; + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_show(m, iter); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + if (iter->flags & FTRACE_ITER_NOTRACE) + seq_puts(m, "#### no functions disabled ####\n"); + else + seq_puts(m, "#### all functions enabled ####\n"); + return 0; + } + + rec = iter->func; + + if (!rec) + return 0; + + seq_printf(m, "%ps", (void *)rec->ip); + if (iter->flags & FTRACE_ITER_ENABLED) { + struct ftrace_ops *ops = NULL; + + seq_printf(m, " (%ld)%s%s", + ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_IPMODIFY ? 
" I" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + seq_printf(m, "\ttramp: %pS", + (void *)ops->trampoline); + else + seq_puts(m, "\ttramp: ERROR!"); + + } + add_trampoline_func(m, ops, rec); + } + + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; + } + + return iter ? 0 : -ENOMEM; +} + +static int +ftrace_enabled_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; + } + + return iter ? 0 : -ENOMEM; +} + +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * tracing_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). 
+ */ +int +ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + struct ftrace_hash *hash; + int ret = 0; + + ftrace_ops_init(ops); + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + + iter->ops = ops; + iter->flags = flag; + + mutex_lock(&ops->func_hash->regex_lock); + + if (flag & FTRACE_ITER_NOTRACE) + hash = ops->func_hash->notrace_hash; + else + hash = ops->func_hash->filter_hash; + + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + iter->hash = alloc_ftrace_hash(size_bits); + else + iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + + if (!iter->hash) { + trace_parser_put(&iter->parser); + kfree(iter); + ret = -ENOMEM; + goto out_unlock; + } + } + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else { + /* Failed */ + free_ftrace_hash(iter->hash); + trace_parser_put(&iter->parser); + kfree(iter); + } + } else + file->private_data = iter; + + out_unlock: + mutex_unlock(&ops->func_hash->regex_lock); + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, + inode, file); +} + +static int ftrace_match(char *str, char *regex, int len, int type) +{ + int matched = 0; + int slen; + + switch (type) { + case MATCH_FULL: + if (strcmp(str, regex) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (strncmp(str, regex, len) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, regex)) + matched = 1; + break; + case MATCH_END_ONLY: + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) + matched = 1; + break; + } + + return matched; +} + +static int +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +{ + struct ftrace_func_entry *entry; + int ret = 0; + + entry = ftrace_lookup_ip(hash, rec->ip); + if (not) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; + + free_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; + + ret = add_hash_entry(hash, rec->ip); + } + return ret; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (mod) { + /* module lookup requires matching the module */ + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (!len) + return 1; + } + + return ftrace_match(str, regex, len, type); +} + +static int +match_records(struct ftrace_hash *hash, char *buff, + int len, char *mod, int not) +{ + unsigned search_len = 0; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + char *search = buff; + int found = 0; + int ret; + + if (len) { + type = 
filter_parse_regex(buff, len, &search, &not); + search_len = strlen(search); + } + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { + if (ftrace_match_record(rec, mod, search, search_len, type)) { + ret = enter_record(hash, rec, not); + if (ret < 0) { + found = ret; + goto out_unlock; + } + found = 1; + } + } while_for_each_ftrace_rec(); + out_unlock: + mutex_unlock(&ftrace_lock); + + return found; +} + +static int +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) +{ + return match_records(hash, buff, len, NULL, 0); +} + +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) +{ + int not = 0; + + /* blank or '*' mean the same */ + if (strcmp(buff, "*") == 0) + buff[0] = 0; + + /* handle the case of 'dont filter this module' */ + if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { + buff[0] = 0; + not = 1; + } + + return match_records(hash, buff, strlen(buff), mod, not); +} + +/* + * We register the module command as a template to show others how + * to register a command as well. + */ + +static int +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) +{ + char *mod; + int ret = -EINVAL; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. + */ + + /* we must have a module name */ + if (!param) + return ret; + + mod = strsep(&param, ":"); + if (!strlen(mod)) + return ret; + + ret = ftrace_match_module_records(hash, func, mod); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +core_initcall(ftrace_mod_cmd_init); + +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct ftrace_func_probe *entry; + struct hlist_head *hhd; + unsigned long key; + + key = hash_long(ip, FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + preempt_disable_notrace(); + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_probe_ops __read_mostly = +{ + .func = function_trace_probe_call, + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_OPS_HASH(trace_probe_ops) +}; + +static int ftrace_probe_registered; + +static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) +{ + int ret; + int i; + + if (ftrace_probe_registered) { + /* still need to update the function call sites */ + if (ftrace_enabled) + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); + return; + } + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? 
*/ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + ret = ftrace_startup(&trace_probe_ops, 0); + + ftrace_probe_registered = 1; +} + +static void __disable_ftrace_function_probe(void) +{ + int i; + + if (!ftrace_probe_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + ftrace_shutdown(&trace_probe_ops, 0); + + ftrace_probe_registered = 0; +} + + +static void ftrace_free_entry(struct ftrace_func_probe *entry) +{ + if (entry->ops->free) + entry->ops->free(entry->ops, entry->ip, &entry->data); + kfree(entry); +} + +int +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_hash *hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type, len, not; + unsigned long key; + int count = 0; + char *search; + int ret; + + type = filter_parse_regex(glob, strlen(glob), &search, &not); + len = strlen(search); + + /* we do not support '!' for function probes */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&trace_probe_ops.func_hash->regex_lock); + + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + count = -ENOMEM; + goto out; + } + + if (unlikely(ftrace_disabled)) { + count = -ENODEV; + goto out; + } + + mutex_lock(&ftrace_lock); + + do_for_each_ftrace_rec(pg, rec) { + + if (!ftrace_match_record(rec, NULL, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not process any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. 
+ */ + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + ret = enter_record(hash, rec, 0); + if (ret < 0) { + kfree(entry); + count = ret; + goto out_unlock; + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(&old_hash_ops); + + if (!ret) + free_ftrace_hash_rcu(old_hash); + else + count = ret; + + out_unlock: + mutex_unlock(&ftrace_lock); + out: + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + free_ftrace_hash(hash); + + return count; +} + +enum { + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 +}; + +static void +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data, int flags) +{ + struct ftrace_func_entry *rec_entry; + struct ftrace_func_probe *entry; + struct ftrace_func_probe *p; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; + struct list_head free_list; + struct ftrace_hash *hash; + struct hlist_node *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + int ret; + + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) + glob = NULL; + else if (glob) { + int not; + + type = filter_parse_regex(glob, strlen(glob), &search, &not); + len = strlen(search); + + /* we do not support '!' for function probes */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&trace_probe_ops.func_hash->regex_lock); + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + /* Hmm, should report this somehow */ + goto out_unlock; + + INIT_LIST_HEAD(&free_list); + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & PROBE_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, glob, len, type)) + continue; + } + + rec_entry = ftrace_lookup_ip(hash, entry->ip); + /* It is possible more than one entry had this ip */ + if (rec_entry) + free_hash_entry(hash, rec_entry); + + hlist_del_rcu(&entry->node); + list_add(&entry->free_list, &free_list); + } + } + mutex_lock(&ftrace_lock); + __disable_ftrace_function_probe(); + /* + * Remove after the disable is called. Otherwise, if the last + * probe is removed, a null hash means *all enabled*. 
+ */ + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + synchronize_sched(); + if (!ret) + free_ftrace_hash_rcu(old_hash); + + list_for_each_entry_safe(entry, p, &free_list, free_list) { + list_del(&entry->free_list); + ftrace_free_entry(entry); + } + mutex_unlock(&ftrace_lock); + + out_unlock: + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + free_ftrace_hash(hash); +} + +void +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data) +{ + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); +} + +void +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) +{ + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); +} + +void unregister_ftrace_function_probe_all(char *glob) +{ + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); +} + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static int ftrace_process_regex(struct ftrace_hash *hash, + char *buff, int len, int enable) +{ + char *func, *command, *next = buff; + struct ftrace_func_command *p; + int ret = -EINVAL; + + func = strsep(&next, ":"); + + if (!next) { + ret = ftrace_match_records(hash, func, len); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; + } + + /* command found */ + + command = strsep(&next, ":"); + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(hash, func, command, next, enable); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + struct trace_parser *parser; + ssize_t ret, read; + + if (!cnt) + return 0; + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + /* iter->hash is a local copy, so we don't need regex_lock */ + + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(iter->hash, parser->buffer, + parser->idx, enable); + trace_parser_clear(parser); + if (ret < 0) + goto out; + } + + ret = read; + out: + return ret; +} + +ssize_t +ftrace_filter_write(struct file *file, const char 
__user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) +{ + struct ftrace_hash **orig_hash; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + if (reset) + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + else + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + + if (!hash) { + ret = -ENOMEM; + goto out_regex_unlock; + } + + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } + + mutex_lock(&ftrace_lock); + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + mutex_unlock(&ftrace_lock); + + out_regex_unlock: + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(hash); + return ret; +} + +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. 
+ */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_global_filter - set a function to filter on with global tracers + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); + +/** + * ftrace_set_global_notrace - set a function to not trace with global tracers + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); + +/* + * command line interface to allow users to set filters on boot up. 
+ */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + ftrace_filter_param = true; + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + ftrace_filter_param = true; + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); + +static unsigned long save_global_trampoline; +static unsigned long save_global_flags; + +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static int __init set_graph_notrace_function(char *str) +{ + strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static void __init set_ftrace_early_graph(char *buf, int enable) +{ + int ret; + char *func; + unsigned long *table = ftrace_graph_funcs; + int *count = &ftrace_graph_count; + + if (!enable) { + table = ftrace_graph_notrace_funcs; + count = &ftrace_graph_notrace_count; + } + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) +{ + char *func; + + ftrace_ops_init(ops); + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(ops, func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf, 1); + if (ftrace_graph_notrace_buf[0]) + set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +} + +int ftrace_regex_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct trace_parser *parser; + int filter_hash; + int ret; + + if (file->f_mode & FMODE_READ) { + iter = m->private; + seq_release(inode, file); + } else + iter = file->private_data; + + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(iter->hash, parser->buffer, parser->idx); + } + + trace_parser_put(parser); + + mutex_lock(&iter->ops->func_hash->regex_lock); + + if (file->f_mode & FMODE_WRITE) { + filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) + orig_hash = 
&iter->ops->func_hash->filter_hash; + else + orig_hash = &iter->ops->func_hash->notrace_hash; + + mutex_lock(&ftrace_lock); + old_hash = *orig_hash; + old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret) { + ftrace_ops_update_code(iter->ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + mutex_unlock(&ftrace_lock); + } + + mutex_unlock(&iter->ops->func_hash->regex_lock); + free_ftrace_hash(iter->hash); + kfree(iter); + + return 0; +} + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +static const struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = seq_read, + .write = ftrace_notrace_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +int ftrace_graph_notrace_count; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { + unsigned long *table; + size_t size; + int *count; + const struct seq_operations *seq_ops; +}; + +static void * +__g_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_graph_data *fgd = m->private; + + if (*pos >= *fgd->count) + return NULL; + return &fgd->table[*pos]; +} + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_graph_data *fgd = m->private; + + mutex_lock(&graph_lock); + + /* Nothing, tell g_show to print all functions are enabled */ + if (!*fgd->count && !*pos) + return (void *)1; + + return __g_next(m, pos); +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + + if (!ptr) + return 0; + + if (ptr == (unsigned long *)1) { + struct ftrace_graph_data *fgd = m->private; + + if (fgd->table == ftrace_graph_funcs) + seq_puts(m, "#### all functions enabled ####\n"); + else + seq_puts(m, "#### no functions disabled ####\n"); + return 0; + } + + seq_printf(m, "%ps\n", (void *)*ptr); + + return 0; +} + +static const struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) +{ + int ret = 0; + + mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + *fgd->count = 0; + memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + } + mutex_unlock(&graph_lock); + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, fgd->seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } + } else + file->private_data = fgd; + + return 
ret; +} + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_notrace_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_notrace_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + kfree(m->private); + seq_release(inode, file); + } else { + kfree(file->private_data); + } + + return 0; +} + +static int +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int search_len; + int fail = 1; + int type, not; + char *search; + bool exists; + int i; + + /* decode regex */ + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + if (!not && *idx >= size) + return -EBUSY; + + search_len = strlen(search); + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + + do_for_each_ftrace_rec(pg, rec) { + + if (ftrace_match_record(rec, NULL, search, search_len, type)) { + /* if it is in the array */ + exists = false; + for (i = 0; i < *idx; i++) { + if (array[i] == rec->ip) { + exists = true; + break; + } + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= size) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } + } + } while_for_each_ftrace_rec(); +out: + mutex_unlock(&ftrace_lock); + + if (fail) + return -EINVAL; + + return 0; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_parser parser; + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; + + if (!cnt) + return 0; + + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + mutex_lock(&graph_lock); + + /* we allow only one expression at a time */ + ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, + parser.buffer); + + mutex_unlock(&graph_lock); + } + + if (!ret) + ret = read; + + trace_parser_put(&parser); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +void ftrace_create_filter_files(struct ftrace_ops *ops, 
+ struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. + */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + +static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) +{ + + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + + trace_create_file("enabled_functions", 0444, + d_tracer, NULL, &ftrace_enabled_fops); + + ftrace_create_filter_files(&global_ops, d_tracer); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + trace_create_file("set_graph_function", 0444, d_tracer, + NULL, + &ftrace_graph_fops); + trace_create_file("set_graph_notrace", 0444, d_tracer, + NULL, + &ftrace_graph_notrace_fops); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + return 0; +} + +static int ftrace_cmp_ips(const void *a, const void *b) +{ + const unsigned long *ipa = a; + const unsigned long *ipb = b; + + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size) +{ + unsigned long *ipa = a; + unsigned long *ipb = b; + unsigned long t; + + t = *ipa; + *ipa = *ipb; + *ipb = t; +} + +static int ftrace_process_locs(struct module *mod, + unsigned long *start, + unsigned long *end) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long count; + unsigned long *p; + unsigned long addr; + unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + sort(start, count, sizeof(*start), + ftrace_cmp_ips, ftrace_swap_ips); + + start_pg = ftrace_allocate_pages(count); + if (!start_pg) + return -ENOMEM; + + mutex_lock(&ftrace_lock); + + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = start_pg; + } else { + if (!ftrace_pages) + goto out; + + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; + } + + ftrace_pages->next = start_pg; + } + + p = start; + pg = start_pg; + while (p < end) { + addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. 
+ */ + if (!addr) + continue; + + if (pg->index == pg->size) { + /* We should have allocated enough */ + if (WARN_ON(!pg->next)) + break; + pg = pg->next; + } + + rec = &pg->records[pg->index++]; + rec->ip = addr; + } + + /* We should have used all pages */ + WARN_ON(pg->next); + + /* Assign the last page to ftrace_pages */ + ftrace_pages = pg; + + /* + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. + */ + if (!mod) + local_irq_save(flags); + ftrace_update_code(mod, start_pg); + if (!mod) + local_irq_restore(flags); + ret = 0; + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + +void ftrace_release_mod(struct module *mod) +{ + struct dyn_ftrace *rec; + struct ftrace_page **last_pg; + struct ftrace_page *pg; + int order; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; + if (within_module_core(rec->ip, mod)) { + /* + * As core pages are first, the first + * page should never be a module page. + */ + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + } else + last_pg = &pg->next; + } + out_unlock: + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) +{ + if (ftrace_disabled || start == end) + return; + ftrace_process_locs(mod, start, end); +} + +void ftrace_module_init(struct module *mod) +{ + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_GOING) + ftrace_release_mod(mod); + + return 0; +} +#else +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ +}; + +void __init ftrace_init(void) +{ + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; + unsigned long count, flags; + int ret; + + local_irq_save(flags); + ret = ftrace_dyn_arch_init(); + local_irq_restore(flags); + if (ret) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); + goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_process_locs(NULL, + __start_mcount_loc, + __stop_mcount_loc); + + ret = register_module_notifier(&ftrace_module_exit_nb); + if (ret) + pr_warning("Failed to 
register trace ftrace module exit notifier\n"); + + set_ftrace_early_filters(); + + return; + failed: + ftrace_disabled = 1; +} + +/* Do nothing if arch does not support this */ +void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ + +/* + * Currently there's no safe way to free a trampoline when the kernel + * is configured with PREEMPT. That is because a task could be preempted + * when it jumped to the trampoline, it may be preempted for a long time + * depending on the system load, and currently there's no way to know + * when it will be off the trampoline. If the trampoline is freed + * too early, when the task runs again, it will be executing on freed + * memory and crash. + */ +#ifdef CONFIG_PREEMPT + /* Currently, only non dynamic ops can have a trampoline */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + return; +#endif + + arch_ftrace_update_trampoline(ops); +} + +#else + +static struct ftrace_ops global_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +}; + +static int __init ftrace_nodyn_init(void) +{ + ftrace_enabled = 1; + return 0; +} +core_initcall(ftrace_nodyn_init); + +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_enable(int command) { } +static inline void ftrace_startup_all(int command) { } +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) + +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + return 1; +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + +static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + + /* + * Control funcs (perf) uses RCU. Only trace if + * RCU is currently active. 
+ */ + if (!rcu_is_watching()) + goto out; + + do_for_each_ftrace_op(op, ftrace_control_list) { + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip, regs)) + op->func(ip, parent_ip, op, regs); + } while_for_each_ftrace_op(op); + out: + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_OPS_HASH(control_ops) +}; + +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored, struct pt_regs *regs) +{ + struct ftrace_ops *op; + int bit; + + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { + pr_warn("op=%p %pS\n", op, op); + goto out; + } + op->func(ip, parent_ip, op, regs); + } + } while_for_each_ftrace_op(op); +out: + preempt_enable_notrace(); + trace_clear_recursion(bit); +} + +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); +} +#endif + +/* + * If there's only one function registered but it does not support + * recursion, this function will be called by the mcount trampoline. + * This function will handle recursion protection. + */ +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + int bit; + + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; + + op->func(ip, parent_ip, op, regs); + + trace_clear_recursion(bit); +} + +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_recurs_func() instead. + * + * Returns the function that the trampoline should call for @ops. + */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. 
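As a concrete illustration of that choice (the callback names my_safe_cb and my_plain_cb are invented):

static struct ftrace_ops safe_ops = {
        .func   = my_safe_cb,           /* does its own recursion checking */
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

static struct ftrace_ops plain_ops = {
        .func   = my_plain_cb,          /* relies on the core for protection */
};

/*
 * ftrace_ops_get_func(&safe_ops) hands the trampoline my_safe_cb directly,
 * while ftrace_ops_get_func(&plain_ops) hands it ftrace_ops_recurs_func(),
 * which brackets my_plain_cb with trace_test_and_set_recursion() and
 * trace_clear_recursion().
 */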
+ */ + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) + return ftrace_ops_recurs_func; + + return ops->func; +} + +static void clear_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + clear_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + rcu_read_lock(); + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); + + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + rcu_read_lock(); + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); +} + +static void clear_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(pid); +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + +static int ftrace_pid_add(int p) +{ + struct pid *pid; + struct ftrace_pid *fpid; + int ret = -EINVAL; + + mutex_lock(&ftrace_lock); + + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); + + if (!pid) + goto out; + + ret = 0; + + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; + + ret = -ENOMEM; + + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; + + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; + + set_ftrace_pid_task(pid); + + ftrace_update_pid_func(); + + ftrace_startup_all(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); + } + + ftrace_update_pid_func(); + ftrace_startup_all(0); + + mutex_unlock(&ftrace_lock); +} + +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_puts(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_puts(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + 
if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = kstrtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + +static const struct file_operations ftrace_pid_fops = { + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = ftrace_pid_release, +}; + +static __init int ftrace_init_tracefs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + ftrace_init_dyn_tracefs(d_tracer); + + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + + ftrace_profile_tracefs(d_tracer); + + return 0; +} +fs_initcall(ftrace_init_tracefs); + +/** + * ftrace_kill - kill ftrace + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + clear_ftrace_function(); +} + +/** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret = -1; + + ftrace_ops_init(ops); + + mutex_lock(&ftrace_lock); + + ret = ftrace_startup(ops, 0); + + mutex_unlock(&ftrace_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_function); + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. 
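Putting the register/unregister pair together, a minimal profiling module might look roughly like this; every identifier starting with my_ is hypothetical, and the hit counter is deliberately simple (and imprecise under concurrency) to keep the sketch short.

#include <linux/ftrace.h>
#include <linux/module.h>

static unsigned long my_hits;

/* must be notrace, as noted above, or the callback would trace itself */
static void notrace my_profile_cb(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *op, struct pt_regs *regs)
{
        my_hits++;
}

static struct ftrace_ops my_profile_ops = {
        .func   = my_profile_cb,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_profile_init(void)
{
        /* no filter set, so the callback fires for every traced function */
        return register_ftrace_function(&my_profile_ops);
}

static void __exit my_profile_exit(void)
{
        unregister_ftrace_function(&my_profile_ops);
        pr_info("my_profile: saw %lu calls\n", my_hits);
}

module_init(my_profile_init);
module_exit(my_profile_exit);
MODULE_LICENSE("GPL");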
+ */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_lock); + ret = ftrace_shutdown(ops, 0); + mutex_unlock(&ftrace_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_function); + +int +ftrace_enable_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = -ENODEV; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) + goto out; + + last_ftrace_enabled = !!ftrace_enabled; + + if (ftrace_enabled) { + + /* we are starting ftrace again */ + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); + + ftrace_startup_sysctl(); + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_lock); + return ret; +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +static void +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. 
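A worked example of the adjustment being described, with invented timestamps: a traced function is entered at t=100, its task is switched out at t=120 (so its ftrace_timestamp becomes 120) and switched back in at t=150, and the function returns at t=160. At the switch-in, timestamp - next->ftrace_timestamp = 30, and adding those 30 units to the calltime of every entry still on the return stack turns the entry recorded at 100 into 130, so the reported duration is 160 - 130 = 30, i.e. only the time actually spent on the CPU. With TRACE_ITER_SLEEP_TIME set, the early return above skips the adjustment and the full 60 units, sleep included, are reported.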
+ */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret, cpu; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + + kfree(ret_stack_list); + return ret; +} + +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) + ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; +} + +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_lock); + + /* we currently allow only one tracer registered at a time */ + if (ftrace_graph_active) { + ret = -EBUSY; + goto out; + } + + register_pm_notifier(&ftrace_suspend_notifier); + + ftrace_graph_active++; + ret = start_graph_tracing(); + if (ret) { + ftrace_graph_active--; + goto out; + } + + ftrace_graph_return = retfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. 
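From the caller's side this API might be used as in the sketch below. The callback names are invented, and the return-callback's signature and its calltime/rettime fields are assumed from trace_func_graph_ret_t rather than spelled out in this hunk.

static int my_graph_entry(struct ftrace_graph_ent *trace)
{
        /* non-zero: record this function and hook its return */
        return 1;
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
        /* trace->rettime - trace->calltime would give the duration (assumed fields) */
}

static int __init my_graph_init(void)
{
        /* return handler first, entry handler second, matching the prototype above */
        return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void __exit my_graph_exit(void)
{
        unregister_ftrace_graph();
}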
+ */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_lock); + + if (unlikely(!ftrace_graph_active)) + goto out; + + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + +#ifdef CONFIG_DYNAMIC_FTRACE + /* + * Function graph does not allocate the trampoline, but + * other global_ops do. We need to reset the ALLOC_TRAMP flag + * if one was used. + */ + global_ops.trampoline = save_global_trampoline; + if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) + global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; +#endif + + out: + mutex_unlock(&ftrace_lock); +} + +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. 
+ */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + t->curr_ret_stack = -1; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + graph_init_task(t, ret_stack); + } +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} +#endif diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 000000000..eb4220a13 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,18 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 000000000..0315d4317 --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,5014 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ftrace_event.h> +#include <linux/ring_buffer.h> +#include <linux/trace_clock.h> +#include <linux/trace_seq.h> +#include <linux/spinlock.h> +#include <linux/irq_work.h> +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ +#include <linux/kmemcheck.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/cpu.h> + +#include <asm/local.h> + +static void update_pages_handler(struct work_struct *work); + +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + trace_seq_puts(s, "# compressed entry header\n"); + trace_seq_puts(s, "\ttype_len : 5 bits\n"); + trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + trace_seq_puts(s, "\tarray : 32 bits\n"); + trace_seq_putc(s, '\n'); + trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return !trace_seq_has_overflowed(s); +} + +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. 
For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. + * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There's three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permantly disable all ring buffers. + */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; + +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanently. 
+ */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); +} + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + /* padding has a NULL time_delta */ + event->type_len = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + /* undefined */ + return -1; + return event->array[0] + RB_EVNT_HDR_SIZE; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + return rb_event_data_length(event); + default: + BUG(); + } + /* not hit */ + return 0; +} + +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. 
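Reading the encoding concretely: an event whose type_len is 3 carries a 12-byte payload in line, since rb_event_data_length() computes 3 * RB_ALIGNMENT = 12 data bytes plus the 4-byte event header (the 5-bit type_len and 27-bit time_delta described earlier), 16 bytes on the ring in total; ring_buffer_event_length() then strips the header again and reports 12. A payload larger than RB_MAX_SMALL_DATA (112 bytes, assuming the usual type_len maximum of 28) is stored with type_len == 0 and its length kept in array[0], which is why the helper also subtracts sizeof(event->array[0]) in that case.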
+ */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_length); + +/* inline for ring buffer fast paths */ +static void * +rb_event_data(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + /* If length is in len field, then array[0] has the data */ + if (event->type_len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_data); + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; + +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ +struct buffer_page { + struct list_head list; /* list of buffer pages */ + local_t write; /* index for next write */ + unsigned read; /* index for next read */ + local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ + struct buffer_data_page *page; /* Actual data page */ +}; + +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + +static void rb_init_page(struct buffer_data_page *bpage) +{ + local_set(&bpage->commit, 0); +} + +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; +} + +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. 
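To make the write-counter split described a little above concrete: a raw write value of 0x00300050 would mean a write position of 0x50 (the low 20 bits covered by RB_WRITE_MASK) with three updaters having each added RB_WRITE_INTCNT while their updates were in flight; masking with RB_WRITE_MASK is how the real index is recovered regardless of how many interrupting updates were recorded.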
+ */ +static void free_buffer_page(struct buffer_page *bpage) +{ + free_page((unsigned long)bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); +} + +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + wait_queue_head_t full_waiters; + bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; +}; + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { + int cpu; + atomic_t record_disabled; + struct ring_buffer *buffer; + raw_spinlock_t reader_lock; /* serialize readers */ + arch_spinlock_t lock; + struct lock_class_key lock_key; + unsigned int nr_pages; + struct list_head *pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* committed pages */ + struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; + local_t entries_bytes; + local_t entries; + local_t overrun; + local_t commit_overrun; + local_t dropped_events; + local_t committing; + local_t commits; + unsigned long read; + unsigned long read_bytes; + u64 write_stamp; + u64 read_stamp; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ + int nr_pages_to_update; + struct list_head new_pages; /* new pages to add */ + struct work_struct update_pages_work; + struct completion update_done; + + struct rb_irq_work irq_work; +}; + +struct ring_buffer { + unsigned flags; + int cpus; + atomic_t record_disabled; + atomic_t resize_disabled; + cpumask_var_t cpumask; + + struct lock_class_key *reader_lock_key; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG_CPU + struct notifier_block cpu_notify; +#endif + u64 (*clock)(void); + + struct rb_irq_work irq_work; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; + u64 read_stamp; +}; + +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. 
+ */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); + if (rbwork->wakeup_full) { + rbwork->wakeup_full = false; + wake_up_all(&rbwork->full_waiters); + } +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) +{ + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); + DEFINE_WAIT(wait); + struct rb_irq_work *work; + int ret = 0; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) { + work = &buffer->irq_work; + /* Full only makes sense on per cpu reads */ + full = false; + } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + while (true) { + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } + + schedule(); + } + + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); + + return ret; +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. 
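One way a blocking consumer might sit on ring_buffer_wait() is sketched below; the kthread wrapper and the ring_buffer_consume() call are assumptions based on the rest of the ring buffer API, not something defined in this hunk.

#include <linux/kthread.h>
#include <linux/ring_buffer.h>

static int my_rb_reader(void *data)
{
        struct ring_buffer *buffer = data;
        u64 ts;

        while (!kthread_should_stop()) {
                /* sleep until cpu 0 has events; full = false, any data will do */
                int ret = ring_buffer_wait(buffer, 0, false);

                if (ret == -EINTR)
                        continue;
                if (ret)
                        break;

                while (!ring_buffer_empty_cpu(buffer, 0)) {
                        struct ring_buffer_event *event;

                        /* assumed consumer-side call from the same header */
                        event = ring_buffer_consume(buffer, 0, &ts, NULL);
                        if (!event)
                                break;
                        /* decode with ring_buffer_event_data()/_length() */
                }
        }
        return 0;
}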
+ * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ + }) + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +static inline u64 rb_time_stamp(struct ring_buffer *buffer) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + time = rb_time_stamp(buffer); + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. 
+ * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the given page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static inline int +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. 
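+ *
+ * This flag-in-pointer scheme works because a struct list_head is
+ * always at least pointer aligned, so the two low bits of ->next are
+ * normally zero and are free to carry the HEAD/UPDATE state.  A
+ * stand-alone model of the tagging helpers used here (illustration
+ * only, not the kernel code):
+ *
+ *	#define FLAG_MASK	3UL
+ *
+ *	static struct list_head *strip_flags(struct list_head *p)
+ *	{
+ *		return (struct list_head *)((unsigned long)p & ~FLAG_MASK);
+ *	}
+ *
+ *	static unsigned long get_flags(struct list_head *p)
+ *	{
+ *		return (unsigned long)p & FLAG_MASK;
+ *	}
+ *
+ *	static struct list_head *set_flags(struct list_head *p,
+ *					   unsigned long flags)
+ *	{
+ *		return (struct list_head *)
+ *		       (((unsigned long)p & ~FLAG_MASK) | flags);
+ *	}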
+ */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. */ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, (unsigned long)&new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. 
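+ *
+ * As an aside: the three rb_head_page_set_*() helpers above implement
+ * the head-push protocol described in the big comment earlier.  In
+ * outline, the writer does the following, where "head" is the current
+ * head page, "prev" the page pointing at it, and "new_head" the page
+ * after head (a sketch; rb_handle_head_page() below is the real thing
+ * and also copes with readers and interrupts racing each step):
+ *
+ *	rb_head_page_set_update(cpu_buffer, head, prev, RB_PAGE_HEAD);
+ *		claim the old head: HEAD -> UPDATE
+ *	rb_head_page_set_head(cpu_buffer, new_head, head, RB_PAGE_NORMAL);
+ *		publish the next page: NORMAL -> HEAD
+ *	rb_head_page_set_normal(cpu_buffer, head, prev, RB_PAGE_UPDATE);
+ *		release the old head: UPDATE -> NORMAL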
+ */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. + */ + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + +/** + * rb_check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safety measure we check to make sure the data pages have not + * been corrupted. 
+ */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = cpu_buffer->pages; + struct buffer_page *bpage, *tmp; + + /* Reset the head page if it exists */ + if (cpu_buffer->head_page) + rb_set_head_page(cpu_buffer); + + rb_head_page_deactivate(cpu_buffer); + + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; + + if (rb_check_list(cpu_buffer, head)) + return -1; + + list_for_each_entry_safe(bpage, tmp, head, list) { + if (RB_WARN_ON(cpu_buffer, + bpage->list.next->prev != &bpage->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + bpage->list.prev->next != &bpage->list)) + return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; + } + + rb_head_page_activate(cpu_buffer); + + return 0; +} + +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) +{ + int i; + struct buffer_page *bpage, *tmp; + + for (i = 0; i < nr_pages; i++) { + struct page *page; + /* + * __GFP_NORETRY flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is + * not destabilized. + */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu)); + if (!bpage) + goto free_pages; + + list_add(&bpage->list, pages); + + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + goto free_pages; + bpage->page = page_address(page); + rb_init_page(bpage->page); + } + + return 0; + +free_pages: + list_for_each_entry_safe(bpage, tmp, pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + + return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + LIST_HEAD(pages); + + WARN_ON(!nr_pages); + + if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + return -ENOMEM; + + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. 
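+ *
+ * In other words, the temporary on-stack list head used during the
+ * allocation is dropped out of the ring just below.  Roughly
+ * (illustration only):
+ *
+ *	LIST_HEAD(pages);          pages <-> A <-> B <-> C <-> (pages)
+ *	first = pages.next;
+ *	list_del(&pages);          A <-> B <-> C <-> (A)
+ *	cpu_buffer->pages = first;
+ *
+ * so every later walk over the pages simply starts wherever
+ * cpu_buffer->pages happens to point and goes round the ring.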
+ */ + cpu_buffer->pages = pages.next; + list_del(&pages); + + cpu_buffer->nr_pages = nr_pages; + + rb_check_pages(cpu_buffer); + + return 0; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + struct page *page; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + raw_spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); + init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!bpage) + goto fail_free_buffer; + + rb_check_bpage(cpu_buffer, bpage); + + cpu_buffer->reader_page = bpage; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) + goto fail_free_reader; + bpage->page = page_address(page); + rb_init_page(bpage->page); + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + ret = rb_allocate_pages(cpu_buffer, nr_pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + rb_head_page_activate(cpu_buffer); + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = cpu_buffer->pages; + struct buffer_page *bpage, *tmp; + + free_buffer_page(cpu_buffer->reader_page); + + rb_head_page_deactivate(cpu_buffer); + + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); + free_buffer_page(bpage); + } + + kfree(cpu_buffer); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif + +/** + * __ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. 
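+ *
+ * The requested size is rounded up to whole data pages and clamped to a
+ * minimum of two pages per cpu.  As a worked example (assuming 4 KiB
+ * pages and the 16 byte page header implied by
+ * ring_buffer_print_page_header(), i.e. BUF_PAGE_SIZE == 4080):
+ *
+ *	size     = 1 << 20                     1 MiB requested
+ *	nr_pages = DIV_ROUND_UP(size, 4080)    = 258 pages per cpu
+ *	usable   = 258 * 4080                  = 1052640 data bytes per cpu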
+ */ +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) +{ + struct ring_buffer *buffer; + int bsize; + int cpu, nr_pages; + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; + + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); + + /* need at least two pages */ + if (nr_pages < 2) + nr_pages = 2; + + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_begin(); + cpumask_copy(buffer->cpumask, cpu_online_mask); +#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_cpumask; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + +#ifdef CONFIG_HOTPLUG_CPU + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); +#endif + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif + + fail_free_buffer: + kfree(buffer); + return NULL; +} +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. 
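+ *
+ * A minimal (hypothetical) user of the allocate/free pair, assuming the
+ * ring_buffer_alloc() wrapper from the header is what supplies the
+ * lockdep class key to __ring_buffer_alloc():
+ *
+ *	struct ring_buffer *rb;
+ *
+ *	rb = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);
+ *	if (!rb)
+ *		return -ENOMEM;
+ *	...
+ *	ring_buffer_free(rb);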
+ */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); +#endif + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif + + kfree(buffer->buffers); + free_cpumask_var(buffer->cpumask); + + kfree(buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_free); + +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ + struct list_head *tail_page, *to_remove, *next_page; + struct buffer_page *to_remove_page, *tmp_iter_page; + struct buffer_page *last_page, *first_page; + unsigned int nr_removed; + unsigned long head_bit; + int page_entries; + + head_bit = 0; + + raw_spin_lock_irq(&cpu_buffer->reader_lock); + atomic_inc(&cpu_buffer->record_disabled); + /* + * We don't race with the readers since we have acquired the reader + * lock. We also don't race with writers after disabling recording. + * This makes it easy to figure out the first and the last page to be + * removed from the list. We unlink all the pages in between including + * the first and last pages. This is done in a busy loop so that we + * lose the least number of traces. + * The pages are freed after we restart recording and unlock readers. + */ + tail_page = &cpu_buffer->tail_page->list; + + /* + * tail page might be on reader page, we remove the next page + * from the ring buffer + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + tail_page = rb_list_head(tail_page->next); + to_remove = tail_page; + + /* start of pages to remove */ + first_page = list_entry(rb_list_head(to_remove->next), + struct buffer_page, list); + + for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { + to_remove = rb_list_head(to_remove)->next; + head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; + } + + next_page = rb_list_head(to_remove)->next; + + /* + * Now we remove all pages between tail_page and next_page. 
+ * Make sure that we have head_bit value preserved for the + * next page + */ + tail_page->next = (struct list_head *)((unsigned long)next_page | + head_bit); + next_page = rb_list_head(next_page); + next_page->prev = tail_page; + + /* make sure pages points to a valid page in the ring buffer */ + cpu_buffer->pages = next_page; + + /* update head page */ + if (head_bit) + cpu_buffer->head_page = list_entry(next_page, + struct buffer_page, list); + + /* + * change read pointer to make sure any read iterators reset + * themselves + */ + cpu_buffer->read = 0; + + /* pages are removed, resume tracing and then free the pages */ + atomic_dec(&cpu_buffer->record_disabled); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + + /* last buffer page to remove */ + last_page = list_entry(rb_list_head(to_remove), struct buffer_page, + list); + tmp_iter_page = first_page; + + do { + to_remove_page = tmp_iter_page; + rb_inc_page(cpu_buffer, &tmp_iter_page); + + /* update the counters */ + page_entries = rb_page_entries(to_remove_page); + if (page_entries) { + /* + * If something was added to this page, it was full + * since it is not the tail page. So we deduct the + * bytes consumed in ring buffer from here. + * Increment overrun to account for the lost events. + */ + local_add(page_entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + } + + /* + * We have already removed references to this list item, just + * free up the buffer_page and its page + */ + free_buffer_page(to_remove_page); + nr_removed--; + + } while (to_remove_page != last_page); + + RB_WARN_ON(cpu_buffer, nr_removed); + + return nr_removed == 0; +} + +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *pages = &cpu_buffer->new_pages; + int retries, success; + + raw_spin_lock_irq(&cpu_buffer->reader_lock); + /* + * We are holding the reader lock, so the reader page won't be swapped + * in the ring buffer. Now we are racing with the writer trying to + * move head page and the tail page. + * We are going to adapt the reader page update process where: + * 1. We first splice the start and end of list of new pages between + * the head page and its previous page. + * 2. We cmpxchg the prev_page->next to point from head page to the + * start of new pages list. + * 3. Finally, we update the head->prev to the end of new list. + * + * We will try this process 10 times, to make sure that we don't keep + * spinning. 
+ */ + retries = 10; + success = 0; + while (retries--) { + struct list_head *head_page, *prev_page, *r; + struct list_head *last_page, *first_page; + struct list_head *head_page_with_bit; + + head_page = &rb_set_head_page(cpu_buffer)->list; + if (!head_page) + break; + prev_page = head_page->prev; + + first_page = pages->next; + last_page = pages->prev; + + head_page_with_bit = (struct list_head *) + ((unsigned long)head_page | RB_PAGE_HEAD); + + last_page->next = head_page_with_bit; + first_page->prev = prev_page; + + r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + + if (r == head_page_with_bit) { + /* + * yay, we replaced the page pointer to our new list, + * now, we just have to update to head page's prev + * pointer to point to end of list + */ + head_page->prev = last_page; + success = 1; + break; + } + } + + if (success) + INIT_LIST_HEAD(pages); + /* + * If we weren't successful in adding in new pages, warn and stop + * tracing + */ + RB_WARN_ON(cpu_buffer, !success); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + /* free pages if they weren't inserted */ + if (!success) { + struct buffer_page *bpage, *tmp; + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + int success; + + if (cpu_buffer->nr_pages_to_update > 0) + success = rb_insert_pages(cpu_buffer); + else + success = rb_remove_pages(cpu_buffer, + -cpu_buffer->nr_pages_to_update); + + if (success) + cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ + struct ring_buffer_per_cpu *cpu_buffer = container_of(work, + struct ring_buffer_per_cpu, update_pages_work); + rb_update_pages(cpu_buffer); + complete(&cpu_buffer->update_done); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * @cpu_id: the cpu buffer to resize + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns 0 on success and < 0 on failure. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, + int cpu_id) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages; + int cpu, err = 0; + + /* + * Always succeed at resizing a non-existent buffer: + */ + if (!buffer) + return size; + + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return size; + + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size *= BUF_PAGE_SIZE; + + /* we need a minimum of two pages */ + if (size < BUF_PAGE_SIZE * 2) + size = BUF_PAGE_SIZE * 2; + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. 
+ */ + if (atomic_read(&buffer->resize_disabled)) + return -EBUSY; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* calculate the pages to update */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + /* + * nothing more to do for removing pages or no update + */ + if (cpu_buffer->nr_pages_to_update <= 0) + continue; + /* + * to add pages, make sure all new pages can be + * allocated without receiving ENOMEM + */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto out_err; + } + } + + get_online_cpus(); + /* + * Fire off all the required work handlers + * We can't schedule on offline CPUs, but it's not necessary + * since we can change their buffer sizes without any race. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } + } + + /* wait for all the updates to complete */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + wait_for_completion(&cpu_buffer->update_done); + cpu_buffer->nr_pages_to_update = 0; + } + + put_online_cpus(); + } else { + /* Make sure this CPU has been intitialized */ + if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu_id]; + + if (nr_pages == cpu_buffer->nr_pages) + goto out; + + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (cpu_buffer->nr_pages_to_update > 0 && + __rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu_id)) { + err = -ENOMEM; + goto out_err; + } + + get_online_cpus(); + + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } + + cpu_buffer->nr_pages_to_update = 0; + put_online_cpus(); + } + + out: + /* + * The ring buffer resize can happen with the ring buffer + * enabled, so that the update disturbs the tracing as little + * as possible. But if the buffer is disabled, we do not need + * to worry about that, and we can take the time to verify + * that the buffer is not corrupt. + */ + if (atomic_read(&buffer->record_disabled)) { + atomic_inc(&buffer->record_disabled); + /* + * Even though the buffer was disabled, we must make sure + * that it is truly disabled before calling rb_check_pages. + * There could have been a race between checking + * record_disable and incrementing it. 
+ */ + synchronize_sched(); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_check_pages(cpu_buffer); + } + atomic_dec(&buffer->record_disabled); + } + + mutex_unlock(&buffer->mutex); + return size; + + out_err: + for_each_buffer_cpu(buffer, cpu) { + struct buffer_page *bpage, *tmp; + + cpu_buffer = buffer->buffers[cpu]; + cpu_buffer->nr_pages_to_update = 0; + + if (list_empty(&cpu_buffer->new_pages)) + continue; + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + mutex_unlock(&buffer->mutex); + return err; +} +EXPORT_SYMBOL_GPL(ring_buffer_resize); + +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + +static inline void * +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) +{ + return bpage->data + index; +} + +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) +{ + return bpage->page->data + index; +} + +static inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + return __rb_page_index(iter->head_page, iter->head); +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + +/* Size is determined by what has been committed */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. 
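+ *
+ * rb_event_is_commit() above is what decides that ownership: an event
+ * is "the commit" when it sits on the commit page and its offset inside
+ * the page equals the commit index.  The offset is plain pointer
+ * arithmetic; with 4 KiB pages and a 16 byte page header (an assumption
+ * for the sake of the example), an event 0xa40 bytes into its page gives
+ *
+ *	index = (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE
+ *	      = 0xa40 - 0x10
+ *	      = 0xa30
+ *
+ * i.e. the event starts 0xa30 bytes into the page's data area.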
+ */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + cpu_buffer->reader_page->read = 0; +} + +static void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = rb_set_head_page(cpu_buffer); + else + rb_inc_page(cpu_buffer, &iter->head_page); + + iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; +} + +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) +{ + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; + + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. 
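+ *
+ * The TIME_EXTEND encoding used by rb_add_time_stamp() above simply
+ * splits an oversized delta across the two words of the extend event:
+ * the low bits go in time_delta, the rest in array[0].  The reader
+ * reverses it (see rb_update_write_stamp() further down).  As a round
+ * trip, with TS_SHIFT standing for the 27 bit limit that
+ * test_time_stamp() enforces:
+ *
+ *	encode:	time_delta = delta & TS_MASK;
+ *		array0     = delta >> TS_SHIFT;
+ *	decode:	delta      = ((u64)array0 << TS_SHIFT) + time_delta;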
+ */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; + } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. 
+ */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length = 1; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + return length; +} + +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + unsigned long tail, unsigned long length) +{ + struct ring_buffer_event *event; + + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + + local_sub(length, &tail_page->write); + return; + } + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + + /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again. + * + * If we are less than the minimum size, we don't need to + * worry about it. + */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ + + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); + + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } + + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} + +/* + * This is the slow path, force gcc not to inline it. 
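+ *
+ * A quick worked example of what rb_calculate_event_length() above
+ * turns a request into, assuming the usual 4 byte event header, 4 byte
+ * alignment and no forced 8 byte alignment: a 12 byte payload is small
+ * enough that no separate length word is needed, so
+ *
+ *	length = 12                        requested payload
+ *	length += RB_EVNT_HDR_SIZE         = 16
+ *	length = ALIGN(length, 4)          = 16 bytes taken from the page
+ *
+ * and rb_update_event() will later encode DIV_ROUND_UP(12, 4) = 3 in
+ * the event's type_len field.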
+ */ +static noinline struct ring_buffer_event * +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *tail_page, u64 ts) +{ + struct buffer_page *commit_page = cpu_buffer->commit_page; + struct ring_buffer *buffer = cpu_buffer->buffer; + struct buffer_page *next_page; + int ret; + + next_page = tail_page; + + rb_inc_page(cpu_buffer, &next_page); + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } + + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { + + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) { + local_inc(&cpu_buffer->dropped_events); + goto out_reset; + } + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also the on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } + } + } + + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; + } + + out_again: + + rb_reset_tail(cpu_buffer, tail_page, tail, length); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + + out_reset: + /* reset write */ + rb_reset_tail(cpu_buffer, tail_page, tail, length); + + return NULL; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, u64 ts, + u64 delta, int add_timestamp) +{ + struct buffer_page *tail_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - length; + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. 
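+ *
+ * Note that the local_add_return() above is the entire reservation:
+ * every writer on this CPU, including interrupts and NMIs nesting
+ * inside us, atomically claims its own disjoint [tail, write) range on
+ * the tail page, so nested events can never overlap.  Stripped to its
+ * core (a sketch, not the literal code):
+ *
+ *	write = local_add_return(length, &tail_page->write) & RB_WRITE_MASK;
+ *	tail  = write - length;
+ *	if (write > BUF_PAGE_SIZE)
+ *		take the rb_move_tail() slow path
+ *
+ * where only the claim that actually crossed the page boundary pads the
+ * old page and pushes the tail page forward; the other nested claims
+ * simply back out and retry on the new page.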
+ */ + if (!tail) + delta = 0; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, length, tail, + tail_page, ts); + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = ts; + + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + + return event; +} + +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return 1; + } + } + + /* could not discard */ + return 0; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int nr_loops = 0; + int add_timestamp; + u64 diff; + + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. 
+ */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + length = rb_calculate_event_length(length); + again: + add_timestamp = 0; + delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable(); +#endif + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + add_timestamp = 1; + } + } + + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); + if (unlikely(PTR_ERR(event) == -EAGAIN)) + goto again; + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +#ifdef CONFIG_TRACING + +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. 
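+ *
+ * A concrete trace of the bit trick, for a normal context writer that
+ * is interrupted by an IRQ which is in turn hit by an NMI:
+ *
+ *	normal lock:	current_context = 1000
+ *	irq lock:	current_context = 1010
+ *	nmi lock:	current_context = 1011
+ *	nmi unlock:	1011 & 1010 = 1010	(bit 0 cleared)
+ *	irq unlock:	1010 & 1001 = 1000	(bit 1 cleared)
+ *	normal unlock:	1000 & 0111 = 0000	(bit 3 cleared)
+ *
+ * A second reserve attempt from any of those contexts in between would
+ * find its own bit already set and bail out as a recursion.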
+ */ +static DEFINE_PER_CPU(unsigned int, current_context); + +static __always_inline int trace_recursive_lock(void) +{ + unsigned int val = __this_cpu_read(current_context); + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + if (unlikely(val & (1 << bit))) + return 1; + + val |= (1 << bit); + __this_cpu_write(current_context, val); + + return 0; +} + +static __always_inline void trace_recursive_unlock(void) +{ + __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); +} + +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. + */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int cpu; + + if (ring_buffer_flags != RB_BUFFERS_ON) + return NULL; + + /* If we are tracing schedule, we don't want to recurse */ + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; + + if (trace_recursive_lock()) + goto out_nocheck; + + cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out; + + return event; + + out: + trace_recursive_unlock(); + + out_nocheck: + preempt_enable_notrace(); + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + /* + * The event first in the commit queue updates the + * time stamp. 
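+ *
+ * For reference, the reserve/commit pair above is used by a writer
+ * roughly like this (a hypothetical caller; struct my_entry is made up
+ * for the example):
+ *
+ *	struct ring_buffer_event *event;
+ *	struct my_entry *entry;
+ *
+ *	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
+ *	if (!event)
+ *		return;
+ *	entry = ring_buffer_event_data(event);
+ *	entry->value = 42;
+ *	ring_buffer_unlock_commit(buffer, event);
+ *
+ * Note that the reserve leaves preemption disabled (and the recursion
+ * bit set) until the matching unlock_commit runs.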
+ */ + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; + + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. + */ +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. 
+ */ + rb_inc_page(cpu_buffer, &bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(cpu_buffer, &bpage); + } while (bpage != start); + + /* commit not part of this buffer?? */ + RB_WARN_ON(cpu_buffer, 1); +} + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the the item has been + * committed. It will try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* The event is discarded regardless */ + rb_event_discard(event); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + + rb_decrement_entry(cpu_buffer, event); + if (rb_try_to_discard(cpu_buffer, event)) + goto out; + + /* + * The commit is still visible by the reader, so we + * must still update the timestamp. + */ + rb_update_write_stamp(cpu_buffer, event); + out: + rb_end_commit(cpu_buffer); + + trace_recursive_unlock(); + + preempt_enable_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. 
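Editorial aside: ring_buffer_discard_commit() above lets a writer back out of an event it has reserved but not yet committed; the event is either freed outright or left behind as padding. A hedged usage sketch follows, with should_keep() as a hypothetical filter callback; as the comment above states, once the discard is issued the event must not also be passed to ring_buffer_unlock_commit().

#include <linux/ring_buffer.h>
#include <linux/types.h>

/* Reserve an event, then either commit it or back it out. */
static void write_filtered(struct ring_buffer *buffer, int value,
                           bool (*should_keep)(int))
{
        struct ring_buffer_event *event;
        int *body;

        event = ring_buffer_lock_reserve(buffer, sizeof(*body));
        if (!event)
                return;

        body = ring_buffer_event_data(event);
        *body = value;

        if (should_keep(value))
                ring_buffer_unlock_commit(buffer, event);
        else
                /* never call ring_buffer_unlock_commit() after this */
                ring_buffer_discard_commit(buffer, event);
}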
+ */ +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + void *body; + int ret = -EBUSY; + int cpu; + + if (ring_buffer_flags != RB_BUFFERS_ON) + return -EBUSY; + + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out; + + cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + ret = 0; + out: + preempt_enable_notrace(); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_write); + +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); + struct buffer_page *commit = cpu_buffer->commit_page; + + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + + return reader->read == rb_page_commit(reader) && + (commit == reader || + (commit == head && + head->read == rb_page_commit(commit))); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable); + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable); + +/** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() version + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd | RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() version + * must be paired with a disable(). 
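Editorial aside: ring_buffer_write() above collapses reserve and commit into one call when the payload already exists; as its kernel-doc notes, the length is the payload size and the event header is added internally. A small hedged sketch, with struct sample and log_sample() invented for illustration:

#include <linux/ring_buffer.h>
#include <linux/types.h>

struct sample {                         /* illustrative fixed-size record */
        u64 ts;
        int value;
};

static int log_sample(struct ring_buffer *buffer, u64 ts, int value)
{
        struct sample s = { .ts = ts, .value = value };

        /* returns 0 on success, -EBUSY if recording is off or space ran out */
        return ring_buffer_write(buffer, sizeof(s), &s);
}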
+ */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd & ~RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); + +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + +/** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + u64 ret = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + if (bpage) + ret = bpage->page->time_stamp; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. 
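Editorial aside: the kernel-doc above distinguishes the counted ring_buffer_record_disable()/ring_buffer_record_enable() pair, which nests like preempt_disable(), from the ring_buffer_record_off()/ring_buffer_record_on() switch, which is not counted. The sketch below contrasts the two; it is illustrative only, and a caller that disables recording in order to read should also wait with synchronize_sched() as the comments above advise.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void toggle_demo(struct ring_buffer *buffer)
{
        /* Counted: every disable must be balanced by an enable. */
        ring_buffer_record_disable(buffer);
        ring_buffer_record_disable(buffer);
        ring_buffer_record_enable(buffer);      /* still disabled here */
        ring_buffer_record_enable(buffer);      /* writes allowed again */

        /* Switch: a single on() undoes any number of off() calls. */
        ring_buffer_record_off(buffer);
        ring_buffer_record_off(buffer);
        ring_buffer_record_on(buffer);

        if (ring_buffer_record_is_on(buffer))
                pr_info("ring buffer accepts writes again\n");
}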
+ */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + + return rb_num_of_entries(cpu_buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); + +/** + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on). + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm. + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->commit_overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). 
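Editorial aside: the per-CPU accessors above (entries, bytes, overruns, commit overruns, oldest timestamp) are cheap reads of per-CPU counters and return 0 for CPUs outside the buffer's cpumask. A hedged sketch of a statistics dump; iterating over online CPUs is a simplification chosen for illustration.

#include <linux/ring_buffer.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

static void dump_cpu_stats(struct ring_buffer *buffer)
{
        int cpu;

        for_each_online_cpu(cpu) {
                pr_info("cpu%d: entries=%lu bytes=%lu overrun=%lu commit_overrun=%lu oldest_ts=%llu\n",
                        cpu,
                        ring_buffer_entries_cpu(buffer, cpu),
                        ring_buffer_bytes_cpu(buffer, cpu),
                        ring_buffer_overrun_cpu(buffer, cpu),
                        ring_buffer_commit_overrun_cpu(buffer, cpu),
                        (unsigned long long)ring_buffer_oldest_event_ts(buffer, cpu));
        }
}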
+ * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->dropped_events); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += rb_num_of_entries(cpu_buffer); + } + + return entries; +} +EXPORT_SYMBOL_GPL(ring_buffer_entries); + +/** + * ring_buffer_overruns - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += local_read(&cpu_buffer->overrun); + } + + return overruns; +} +EXPORT_SYMBOL_GPL(ring_buffer_overruns); + +static void rb_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* Iterator usage is expected to have record disabled */ + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = cpu_buffer->read; + + if (iter->head) + iter->read_stamp = cpu_buffer->read_stamp; + else + iter->read_stamp = iter->head_page->page->time_stamp; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. 
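Editorial aside: rb_num_of_entries() above defines the readable entry count as entries written minus (entries overwritten + entries read), and ring_buffer_entries()/ring_buffer_overruns() simply sum that per-CPU state. A small hedged sketch with a worked number in the comment; the figures are made up for illustration.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void dump_totals(struct ring_buffer *buffer)
{
        /*
         * Per CPU the readable count is written - (overwritten + read),
         * e.g. 1000 written, 200 overwritten, 300 read => 500 readable.
         * The totals below sum that over every CPU and, as the comments
         * above note, are only exact if writers are held off.
         */
        pr_info("readable entries=%lu, overruns=%lu\n",
                ring_buffer_entries(buffer),
                ring_buffer_overruns(buffer));
}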
+ */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + return iter->head_page == cpu_buffer->commit_page && + iter->head == rb_commit_index(cpu_buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long overwrite; + unsigned long flags; + int nr_loops = 0; + int ret; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { + reader = NULL; + goto out; + } + + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + /* Never should we have an index greater than the size */ + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + + /* + * Reset the reader page to size zero. + */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; + + spin: + /* + * Splice the empty reader page into the list around the head. 
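Editorial aside: rb_update_read_stamp() and rb_update_iter_read_stamp() above rebuild a large timestamp delta from a RINGBUF_TYPE_TIME_EXTEND event by shifting array[0] up by TS_SHIFT and adding the small time_delta carried in the event header. The standalone userspace sketch below just redoes that arithmetic with made-up numbers; TS_SHIFT is defined earlier in this file, so the value 27 used here is an assumption for illustration.

#include <stdio.h>
#include <stdint.h>

#define TS_SHIFT 27     /* assumed; the real definition lives earlier in the file */

int main(void)
{
        uint64_t array0 = 5;            /* upper delta bits from the time extend */
        uint64_t time_delta = 123456;   /* low bits from the event header */
        uint64_t delta = (array0 << TS_SHIFT) + time_delta;

        /* 5 << 27 = 671088640, plus 123456 = 671212096 */
        printf("reconstructed delta: %llu\n", (unsigned long long)delta);
        return 0;
}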
+ */ + reader = rb_set_head_page(cpu_buffer); + if (!reader) + goto out; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); + cpu_buffer->reader_page->list.prev = reader->list.prev; + + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidentally swap it. + */ + cpu_buffer->pages = reader->list.prev; + + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); + + /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); + + /* + * If we did not convert it, then we must try again. + */ + if (!ret) + goto spin; + + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + rb_reset_reader_page(cpu_buffer); + + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + + goto again; + + out: + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); + + /* This function should not be called when buffer is empty */ + if (RB_WARN_ON(cpu_buffer, !reader)) + return; + + event = rb_reader_event(cpu_buffer); + + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + cpu_buffer->read++; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader_page->read += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned length; + + cpu_buffer = iter->cpu_buffer; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->head >= rb_page_size(iter->head_page)) { + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) + return; + rb_inc_iter(iter); + return; + } + + event = rb_iter_head_event(iter); + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. 
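Editorial aside: the "tricky part" comment above relies on tagging the low bits of the list pointer that leads to the head page: 1 means it points at the head, 2 means the writer is moving it, 0 means it has already moved, and rb_head_page_replace() atomically swaps the tagged pointer and can fail, which is why the code loops back to the spin label. The standalone userspace sketch below only demonstrates the low-bit tagging idea; the macro names are invented, while the 0/1/2 meanings follow the comment above.

#include <stdio.h>
#include <stdint.h>

#define PAGE_FLAG_MASK 3UL      /* low two pointer bits carry the state */
#define PAGE_HEAD      1UL      /* next page is the head */
#define PAGE_UPDATE    2UL      /* writer is moving the head */

struct page_stub { long id; }; /* aligned, so the low bits are free for tags */

int main(void)
{
        struct page_stub head = { 42 };
        uintptr_t link = (uintptr_t)&head | PAGE_HEAD;

        /* strip the tag to follow the pointer, test it to recognise the head */
        struct page_stub *next = (struct page_stub *)(link & ~PAGE_FLAG_MASK);
        int points_at_head = (link & PAGE_FLAG_MASK) == PAGE_HEAD;

        printf("next page id=%ld, points at head=%d\n", next->id, points_at_head);
        return 0;
}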
+ */ + if (RB_WARN_ON(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; + + rb_update_iter_read_stamp(iter, event); + + iter->head += length; + + /* check for end of page padding */ + if ((iter->head >= rb_page_size(iter->head_page)) && + (iter->head_page != cpu_buffer->commit_page)) + rb_inc_iter(iter); +} + +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + int nr_loops = 0; + + again: + /* + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + return NULL; + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + return NULL; + + event = rb_reader_event(cpu_buffer); + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. + */ + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); + return event; + + default: + BUG(); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_peek); + +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int nr_loops = 0; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + + /* + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). 
+ */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) + return NULL; + + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + if (iter->head >= rb_page_size(iter->head_page)) { + rb_inc_iter(iter); + goto again; + } + + event = rb_iter_head_event(iter); + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); + +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. + */ + if (likely(!in_nmi())) + return 1; + + tracing_off_permanent(); + return 0; +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + unsigned long flags; + int dolock; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return NULL; + + dolock = rb_ok_to_lock(); + again: + local_irq_save(flags); + if (dolock) + raw_spin_lock(&cpu_buffer->reader_lock); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); + if (dolock) + raw_spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + if (event && event->type_len == RINGBUF_TYPE_PADDING) + goto again; + + return event; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_event *event; + unsigned long flags; + + again: + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (event && event->type_len == RINGBUF_TYPE_PADDING) + goto again; + + return event; +} + +/** + * ring_buffer_consume - return an event and consume it + * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) + * + * Returns the next event in the ring buffer, and that event is consumed. 
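Editorial aside: ring_buffer_peek() above returns the next event for a CPU without consuming it, so a later peek or consume sees the same event; padding events are skipped transparently by the retry loop. A hedged kernel-module sketch; peek_next() is an invented helper.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void peek_next(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost;
        u64 ts;

        event = ring_buffer_peek(buffer, cpu, &ts, &lost);
        if (!event) {
                pr_info("cpu%d: nothing to read\n", cpu);
                return;
        }

        /* the event stays in the buffer; a later consume returns it again */
        pr_info("cpu%d: next event %u bytes, ts=%llu, lost=%lu\n",
                cpu, ring_buffer_event_length(event),
                (unsigned long long)ts, lost);
}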
+ * Meaning, that sequential reads will keep returning a different event, + * and eventually empty the ring buffer if the producer is slower. + */ +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event = NULL; + unsigned long flags; + int dolock; + + dolock = rb_ok_to_lock(); + + again: + /* might be called in atomic */ + preempt_disable(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + if (dolock) + raw_spin_lock(&cpu_buffer->reader_lock); + + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; + rb_advance_reader(cpu_buffer); + } + + if (dolock) + raw_spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + out: + preempt_enable(); + + if (event && event->type_len == RINGBUF_TYPE_PADDING) + goto again; + + return event; +} +EXPORT_SYMBOL_GPL(ring_buffer_consume); + +/** + * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * @buffer: The ring buffer to read from + * @cpu: The cpu buffer to iterate over + * + * This performs the initial preparations necessary to iterate + * through the buffer. Memory is allocated, buffer recording + * is disabled, and the iterator pointer is returned to the caller. + * + * Disabling buffer recordng prevents the reading from being + * corrupted. This is not a consuming read, so a producer is not + * expected. + * + * After a sequence of ring_buffer_read_prepare calls, the user is + * expected to make at least one call to ring_buffer_read_prepare_sync. + * Afterwards, ring_buffer_read_start is invoked to get things going + * for real. + * + * This overall must be paired with ring_buffer_read_finish. + */ +struct ring_buffer_iter * +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_iter *iter; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + iter->cpu_buffer = cpu_buffer; + + atomic_inc(&buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + + return iter; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); + +/** + * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls + * + * All previously invoked ring_buffer_read_prepare calls to prepare + * iterators will be synchronized. Afterwards, read_buffer_read_start + * calls on those iterators are allowed. + */ +void +ring_buffer_read_prepare_sync(void) +{ + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @iter: The iterator returned by ring_buffer_read_prepare + * + * This finalizes the startup of an iteration through the buffer. + * The iterator comes from a call to ring_buffer_read_prepare and + * an intervening ring_buffer_read_prepare_sync must have been + * performed. + * + * Must be paired with ring_buffer_read_finish. 
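Editorial aside: ring_buffer_consume() above removes the returned event from the buffer, which is exactly how the startup test near the end of this file drains each CPU. A hedged sketch of such a drain loop; drain_cpu() and the way the payload is interpreted are invented for illustration.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost;
        u64 ts;

        /* each successful call advances the reader past the event */
        while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
                unsigned char *body = ring_buffer_event_data(event);

                pr_info("cpu%d: ts=%llu lost=%lu len=%u first byte=%#x\n",
                        cpu, (unsigned long long)ts, lost,
                        ring_buffer_event_length(event), body[0]);
        }
}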
+ */ +void +ring_buffer_read_start(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + arch_spin_lock(&cpu_buffer->lock); + rb_iter_reset(iter); + arch_spin_unlock(&cpu_buffer->lock); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_start); + +/** + * ring_buffer_read_finish - finish reading the iterator of the buffer + * @iter: The iterator retrieved by ring_buffer_start + * + * This re-enables the recording to the buffer, and frees the + * iterator. + */ +void +ring_buffer_read_finish(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + /* + * Ring buffer is disabled from recording, here's a good place + * to check the integrity of the ring buffer. + * Must prevent readers from trying to read, as the check + * clears the HEAD page and readers require it. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->buffer->resize_disabled); + kfree(iter); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_finish); + +/** + * ring_buffer_read - read the next item in the ring buffer by the iterator + * @iter: The ring buffer iterator + * @ts: The time stamp of the event read. + * + * This reads the next event in the ring buffer and increments the iterator. + */ +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + again: + event = rb_iter_peek(iter, ts); + if (!event) + goto out; + + if (event->type_len == RINGBUF_TYPE_PADDING) + goto again; + + rb_advance_iter(iter); + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} +EXPORT_SYMBOL_GPL(ring_buffer_read); + +/** + * ring_buffer_size - return the size of the ring buffer (in bytes) + * @buffer: The ring buffer. + */ +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) +{ + /* + * Earlier, this method returned + * BUF_PAGE_SIZE * buffer->nr_pages + * Since the nr_pages field is now removed, we have converted this to + * return the per cpu buffer value. 
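Editorial aside: the non-consuming read path above is a four-step protocol: ring_buffer_read_prepare() for every CPU of interest, one ring_buffer_read_prepare_sync(), ring_buffer_read_start() per iterator, and finally ring_buffer_read_finish() to re-enable recording and free the iterator. A hedged single-CPU sketch; iterate_cpu() is an invented helper.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void iterate_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_iter *iter;
        struct ring_buffer_event *event;
        u64 ts;

        iter = ring_buffer_read_prepare(buffer, cpu);
        if (!iter)
                return;
        ring_buffer_read_prepare_sync();        /* after all prepare calls */
        ring_buffer_read_start(iter);

        /* walks the buffer without consuming; writers stay disabled */
        while ((event = ring_buffer_read(iter, &ts)))
                pr_info("cpu%d: ts=%llu len=%u\n", cpu,
                        (unsigned long long)ts,
                        ring_buffer_event_length(event));

        ring_buffer_read_finish(iter);          /* re-enables recording, frees iter */
}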
+ */ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; +} +EXPORT_SYMBOL_GPL(ring_buffer_size); + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ + rb_head_page_deactivate(cpu_buffer); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); + + cpu_buffer->head_page->read = 0; + + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->commit_page = cpu_buffer->head_page; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->read = 0; + + local_set(&cpu_buffer->entries_bytes, 0); + local_set(&cpu_buffer->overrun, 0); + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->dropped_events, 0); + local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + + cpu_buffer->write_stamp = 0; + cpu_buffer->read_stamp = 0; + + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + + rb_head_page_activate(cpu_buffer); +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + unsigned long flags; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + atomic_inc(&buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_sched(); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + arch_spin_unlock(&cpu_buffer->lock); + + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&buffer->resize_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + ring_buffer_reset_cpu(buffer, cpu); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset); + +/** + * rind_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +int ring_buffer_empty(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; + int cpu; + int ret; + + dolock = rb_ok_to_lock(); + + /* yes this is racy, but if you don't like the race, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + if (dolock) + raw_spin_lock(&cpu_buffer->reader_lock); + ret = rb_per_cpu_empty(cpu_buffer); + if (dolock) + raw_spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + if (!ret) + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_empty); + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 
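Editorial aside: ring_buffer_reset_cpu() above discards everything unread on one CPU after quiescing writers with synchronize_sched(), and ring_buffer_reset() applies that to every CPU. A short hedged sketch; clear_all() is an invented helper, and the emptiness check is racy unless writers are stopped, as the comment in ring_buffer_empty() notes.

#include <linux/ring_buffer.h>

static void clear_all(struct ring_buffer *buffer)
{
        if (ring_buffer_empty(buffer))
                return;                 /* nothing unread on any CPU */

        /* drops all unread events; ring_buffer_reset_cpu() does one CPU */
        ring_buffer_reset(buffer);
}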
+ * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; + int ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 1; + + dolock = rb_ok_to_lock(); + + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + if (dolock) + raw_spin_lock(&cpu_buffer->reader_lock); + ret = rb_per_cpu_empty(cpu_buffer); + if (dolock) + raw_spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. + */ +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; + + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) + goto out; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + + /* At least make sure the two buffers are somewhat the same */ + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) + goto out; + + ret = -EAGAIN; + + if (ring_buffer_flags != RB_BUFFERS_ON) + goto out; + + if (atomic_read(&buffer_a->record_disabled)) + goto out; + + if (atomic_read(&buffer_b->record_disabled)) + goto out; + + if (atomic_read(&cpu_buffer_a->record_disabled)) + goto out; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + goto out; + + /* + * We can't do a synchronize_sched here because this + * function can be called in atomic context. + * Normally this will be called from the same CPU as cpu. + * If not it's up to the caller to protect this. + */ + atomic_inc(&cpu_buffer_a->record_disabled); + atomic_inc(&cpu_buffer_b->record_disabled); + + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + + buffer_a->buffers[cpu] = cpu_buffer_b; + buffer_b->buffers[cpu] = cpu_buffer_a; + + cpu_buffer_b->buffer = buffer_a; + cpu_buffer_a->buffer = buffer_b; + + ret = 0; + +out_dec: + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); +out: + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. 
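Editorial aside: ring_buffer_swap_cpu() above (only built with CONFIG_RING_BUFFER_ALLOW_SWAP) exchanges one CPU's buffer between two ring buffers of the same per-CPU size, which lets a tracer snapshot a CPU cheaply while recording continues into the spare pages. A hedged sketch; snapshot_cpu() is an invented helper and error handling is abbreviated.

#include <linux/ring_buffer.h>

static int snapshot_cpu(struct ring_buffer *live, struct ring_buffer *spare,
                        int cpu)
{
        int ret;

        /* on success, "spare" holds what "live" had recorded for this CPU */
        ret = ring_buffer_swap_cpu(live, spare, cpu);
        if (ret)
                return ret;     /* -EINVAL, -EAGAIN or -EBUSY as described above */

        /* the snapshot in "spare" can be consumed while "live" keeps recording */
        return 0;
}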
+ */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) +{ + struct buffer_data_page *bpage; + struct page *page; + + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + return NULL; + + bpage = page_address(page); + + rb_init_page(bpage); + + return bpage; +} +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_read_page(buffer, cpu); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * if (ret >= 0) + * process_page(rpage, ret); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, size_t len, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long flags; + unsigned int commit; + unsigned int read; + u64 save_timestamp; + int ret = -1; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + goto out; + + len -= BUF_PAGE_HDR_SIZE; + + if (!data_page) + goto out; + + bpage = *data_page; + if (!bpage) + goto out; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + goto out_unlock; + + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + /* + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. 
+ */ + if (read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int rpos = read; + unsigned int pos = 0; + unsigned int size; + + if (full) + goto out_unlock; + + if (len > (commit - read)) + len = (commit - read); + + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + + if (len < size) + goto out_unlock; + + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + + /* Need to copy one event at a time */ + do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); + memcpy(bpage->data + pos, rpage->data + rpos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + rpos = reader->read; + pos += size; + + if (rpos >= commit) + break; + + event = rb_reader_event(cpu_buffer); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); + + /* update bpage */ + local_set(&bpage->commit, pos); + bpage->time_stamp = save_timestamp; + + /* we copied everything to the beginning */ + read = 0; + } else { + /* update the entry counter */ + cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; + + /* swap the pages */ + rb_init_page(bpage); + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + local_set(&reader->entries, 0); + reader->read = 0; + *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + } + ret = read; + + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. 
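Editorial aside: ring_buffer_read_page() above either copies events into the caller's page or swaps pages outright, and when events were lost it folds RB_MISSED_EVENTS (and possibly RB_MISSED_STORED plus the count at the end of the page) into the page's commit field, so a consumer must mask those flag bits off before treating the commit as a length, just as the benchmark module later in this patch masks it with 0xfffff. A hedged sketch of the allocate/read/free cycle; the page-header view mirrors the benchmark's struct rb_page, while read_one_page(), process_bytes() and the mask constant are illustrative rather than definitions from this file.

#include <linux/ring_buffer.h>
#include <linux/errno.h>
#include <asm/local.h>
#include <asm/page.h>

/* view of the data page header, matching the benchmark in this patch */
struct rb_page_view {
        u64             ts;
        local_t         commit;
        char            data[];
};

static int read_one_page(struct ring_buffer *buffer, int cpu,
                         void (*process_bytes)(const char *, unsigned long))
{
        struct rb_page_view *view;
        unsigned long commit;
        void *page;
        int ret;

        page = ring_buffer_alloc_read_page(buffer, cpu);
        if (!page)
                return -ENOMEM;

        /* full=1: only succeed when a whole page can be handed over */
        ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
        if (ret >= 0) {
                view = page;
                /* strip RB_MISSED_* flag bits before using commit as a length */
                commit = local_read(&view->commit) & 0xfffff;
                process_bytes(view->data, commit);
        }

        ring_buffer_free_read_page(buffer, page);
        return ret;
}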
+ */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + + out_unlock: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + out: + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page); + +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + int cpu_i, nr_pages_same; + unsigned int nr_pages; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return NOTIFY_OK; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; + } + } + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. + */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. + */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { + struct ring_buffer *buffer; + unsigned long events; + unsigned long bytes_written; + unsigned long bytes_alloc; + unsigned long bytes_dropped; + unsigned long events_nested; + unsigned long bytes_written_nested; + unsigned long bytes_alloc_nested; + unsigned long bytes_dropped_nested; + int min_size_nested; + int max_size_nested; + int max_size; + int min_size; + int cpu; + int cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE 1048576 + +static char rb_string[] __initdata = + "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" + "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" + "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { + int size; + char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ + struct ring_buffer_event *event; + struct rb_item *item; + bool started; + int event_len; + int size; + int len; + int cnt; + + /* Have nested writes different that what is written */ + cnt = data->cnt + (nested ? 
27 : 0); + + /* Multiply cnt by ~e, to make some unique increment */ + size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + + len = size + sizeof(struct rb_item); + + started = rb_test_started; + /* read rb_test_started before checking buffer enabled */ + smp_rmb(); + + event = ring_buffer_lock_reserve(data->buffer, len); + if (!event) { + /* Ignore dropped events before test starts. */ + if (started) { + if (nested) + data->bytes_dropped += len; + else + data->bytes_dropped_nested += len; + } + return len; + } + + event_len = ring_buffer_event_length(event); + + if (RB_WARN_ON(data->buffer, event_len < len)) + goto out; + + item = ring_buffer_event_data(event); + item->size = size; + memcpy(item->str, rb_string, size); + + if (nested) { + data->bytes_alloc_nested += event_len; + data->bytes_written_nested += len; + data->events_nested++; + if (!data->min_size_nested || len < data->min_size_nested) + data->min_size_nested = len; + if (len > data->max_size_nested) + data->max_size_nested = len; + } else { + data->bytes_alloc += event_len; + data->bytes_written += len; + data->events++; + if (!data->min_size || len < data->min_size) + data->max_size = len; + if (len > data->max_size) + data->max_size = len; + } + + out: + ring_buffer_unlock_commit(data->buffer, event); + + return 0; +} + +static __init int rb_test(void *arg) +{ + struct rb_test_data *data = arg; + + while (!kthread_should_stop()) { + rb_write_something(data, false); + data->cnt++; + + set_current_state(TASK_INTERRUPTIBLE); + /* Now sleep between a min of 100-300us and a max of 1ms */ + usleep_range(((data->cnt % 3) + 1) * 100, 1000); + } + + return 0; +} + +static __init void rb_ipi(void *ignore) +{ + struct rb_test_data *data; + int cpu = smp_processor_id(); + + data = &rb_data[cpu]; + rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ + while (!kthread_should_stop()) { + + /* Send an IPI to all cpus to write data! */ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct ring_buffer *buffer; + int cpu; + int ret = 0; + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], + "rbtester/%d", cpu); + if (WARN_ON(!rb_threads[cpu])) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + kthread_bind(rb_threads[cpu], cpu); + wake_up_process(rb_threads[cpu]); + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(!rb_hammer)) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. 
+ */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! */ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + if (!total_lost) { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 000000000..1b28df2d9 --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,483 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/ktime.h> +#include <asm/local.h> + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 
10ULL +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts, NULL); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer, cpu); + if (!bpage) + return EVENT_DROPPED; + + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* failed writes may be discarded events */ + if (!event->time_delta) + KILL_TEST(); + inc = event->array[0] + 4; + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0] + 4; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if 
(read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + ktime_t start_time, end_time, timeout; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + trace_printk("Starting ring buffer hammer\n"); + start_time = ktime_get(); + timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); + do { + struct ring_buffer_event *event; + int *entry; + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + } + end_time = ktime_get(); + + cnt++; + if (consumer && !(cnt % wakeup_interval)) + wake_up_process(consumer); + +#ifndef CONFIG_PREEMPT + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. + */ + if (cnt % wakeup_interval) + cond_resched(); +#endif + + } while (ktime_before(end_time, timeout) && !kill_test); + trace_printk("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = ktime_us_delta(end_time, start_time); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + + trace_printk("Time: %lld (usecs)\n", time); + trace_printk("Overruns: %lld\n", overruns); + if (disable_reader) + trace_printk("Read: (reader disabled)\n"); + else + trace_printk("Read: %ld (by %s)\n", read, + read_events ? 
"events" : "pages"); + trace_printk("Entries: %lld\n", entries); + trace_printk("Total: %lld\n", entries + overruns + read); + trace_printk("Missed: %ld\n", missed); + trace_printk("Hit: %ld\n", hit); + + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); + if (time) + hit /= (long)time; + else + trace_printk("TIME IS ZERO??\n"); + + trace_printk("Entries per millisec: %ld\n", hit); + + if (hit) { + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; + trace_printk("%ld ns per entry\n", avg); + } + + if (missed) { + if (time) + missed /= (long)time; + + trace_printk("Total iterations per millisec: %ld\n", + hit + missed); + + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + trace_printk("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); + trace_printk("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + trace_printk("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = producer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); 
+MODULE_LICENSE("GPL"); diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000..4b3b5eaf9 --- /dev/null +++ b/kernel/trace/rpm-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/usb.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rpm.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 000000000..05330494a --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,7205 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/ring_buffer.h> +#include <generated/utsrelease.h> +#include <linux/stacktrace.h> +#include <linux/writeback.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/notifier.h> +#include <linux/irqflags.h> +#include <linux/debugfs.h> +#include <linux/tracefs.h> +#include <linux/pagemap.h> +#include <linux/hardirq.h> +#include <linux/linkage.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/splice.h> +#include <linux/kdebug.h> +#include <linux/string.h> +#include <linux/mount.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/fs.h> +#include <linux/sched/rt.h> + +#include "trace.h" +#include "trace_output.h" + +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +bool ring_buffer_expanded; + +/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as trace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static bool __read_mostly tracing_selftest_running; + +/* + * If a tracer is running, we do not want to run SELFTEST. + */ +bool __read_mostly tracing_selftest_disabled; + +/* Pipe tracepoints to printk */ +struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; + +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + return 0; +} + +/* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_cmdline_save); + +/* + * Kill all tracing for good (never come back). 
+ * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +static int tracing_disabled = 1; + +DEFINE_PER_CPU(int, ftrace_cpu_disabled); + +cpumask_var_t __read_mostly tracing_buffer_mask; + +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops + */ + +enum ftrace_dump_mode ftrace_dump_on_oops; + +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +/* Map of enums to their values, for "enum_map" file */ +struct trace_enum_map_head { + struct module *mod; + unsigned long length; +}; + +union trace_enum_map_item; + +struct trace_enum_map_tail { + /* + * "end" is first and points to NULL as it must be different + * than "mod" or "enum_string" + */ + union trace_enum_map_item *next; + const char *end; /* points to NULL */ +}; + +static DEFINE_MUTEX(trace_enum_mutex); + +/* + * The trace_enum_maps are saved in an array with two extra elements, + * one at the beginning, and one at the end. The beginning item contains + * the count of the saved maps (head.length), and the module they + * belong to if not built in (head.mod). The ending item contains a + * pointer to the next array of saved enum_map items. 
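/*
 * Illustrative aside, not part of the kernel import: a userspace model of the
 * head / map... / tail array layout described above and defined just below.
 * The struct members and sample data are invented for the sketch; only the
 * shape (count in the first slot, chain pointer in the last) mirrors the
 * real trace_enum_maps.
 */
#include <stdio.h>

struct demo_map      { const char *name; long value; };
struct demo_map_head { void *owner; unsigned long length; };
struct demo_map_tail { union demo_item *next; const char *end; };

union demo_item {
        struct demo_map      map;
        struct demo_map_head head;
        struct demo_map_tail tail;
};

static void walk(const union demo_item *arr)
{
        while (arr) {
                unsigned long i, len = arr[0].head.length;

                /* slots 1..len hold the real name/value entries */
                for (i = 1; i <= len; i++)
                        printf("%s = %ld\n", arr[i].map.name, arr[i].map.value);

                /* the extra slot at the end chains to the next saved array */
                arr = arr[len + 1].tail.next;
        }
}

static union demo_item second[] = {
        { .head = { .owner = NULL, .length = 1 } },
        { .map  = { .name = "HZ",  .value = 100 } },
        { .tail = { .next = NULL,  .end = NULL } },
};

static union demo_item first[] = {
        { .head = { .owner = NULL, .length = 2 } },
        { .map  = { .name = "GFP_ATOMIC", .value = 0x20 } },
        { .map  = { .name = "GFP_KERNEL", .value = 0xd0 } },
        { .tail = { .next = second, .end = NULL } },
};

int main(void)
{
        walk(first);
        return 0;
}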
+ */ +union trace_enum_map_item { + struct trace_enum_map map; + struct trace_enum_map_head head; + struct trace_enum_map_tail tail; +}; + +static union trace_enum_map_item *trace_enum_maps; +#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ + +static int tracing_set_tracer(struct trace_array *tr, const char *buf); + +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; +static char *default_bootup_tracer; + +static bool allocate_snapshot; + +static int __init set_cmdline_ftrace(char *str) +{ + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + default_bootup_tracer = bootup_tracer_buf; + /* We are using ftrace early, expand it */ + ring_buffer_expanded = true; + return 1; +} +__setup("ftrace=", set_cmdline_ftrace); + +static int __init set_ftrace_dump_on_oops(char *str) +{ + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + +static int __init stop_trace_on_warning(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning", stop_trace_on_warning); + +static int __init boot_alloc_snapshot(char *str) +{ + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_options __initdata; + +static int __init set_trace_boot_options(char *str) +{ + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_boot_options = trace_boot_options_buf; + return 0; +} +__setup("trace_options=", set_trace_boot_options); + +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + +static int __init set_tracepoint_printk(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); + +unsigned long long ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. 
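/*
 * Illustrative aside, not part of the kernel import: the __setup() handlers
 * above share a small parsing convention -- a bare option picks a default,
 * while an "=value" suffix selects a mode. The two helpers below mirror the
 * logic of set_ftrace_dump_on_oops() and stop_trace_on_warning(); the
 * function names and the enum are invented for this sketch.
 */
#include <stdio.h>
#include <string.h>

enum dump_mode { DUMP_NONE, DUMP_ALL, DUMP_ORIG };

/* bare "ftrace_dump_on_oops" means dump everything, "=orig_cpu" only the
 * CPU that triggered the oops, anything else is rejected */
static enum dump_mode parse_dump_on_oops(const char *str)
{
        if (*str++ != '=' || !*str)
                return DUMP_ALL;
        if (!strcmp(str, "orig_cpu"))
                return DUMP_ORIG;
        return DUMP_NONE;
}

/* anything other than "=0" or "=off" enables the option */
static int parse_on_off(const char *str)
{
        return strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0;
}

int main(void)
{
        printf("ftrace_dump_on_oops          -> %d\n", parse_dump_on_oops(""));
        printf("ftrace_dump_on_oops=orig_cpu -> %d\n", parse_dump_on_oops("=orig_cpu"));
        printf("traceoff_on_warning          -> %d\n", parse_on_off(""));
        printf("traceoff_on_warning=off      -> %d\n", parse_on_off("=off"));
        return 0;
}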
+ */ +static struct trace_array global_trace; + +LIST_HEAD(ftrace_trace_arrays); + +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ + WARN_ON(!this_tr->ref); + this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} + +int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + !filter_match_preds(file->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(call_filter_check_discard); + +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!buf->buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(buf->buffer, cpu); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); + + return ts; +} + +cycle_t ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +} + +/** + * tracing_is_enabled - Show if global_trace has been disabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */ +int tracing_is_enabled(void) +{ + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. + */ + smp_rmb(); + return !global_trace.buffer_disabled; +} + +/* + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. + */ +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; + +/* trace_types holds a link list of available tracers. */ +static struct tracer *trace_types __read_mostly; + +/* + * trace_types_lock is used to protect the trace_types list. + */ +DEFINE_MUTEX(trace_types_lock); + +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. 
+ * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + +/* trace_flags holds trace_options default values */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; + +static void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_on(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} + +/** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + tracer_tracing_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. 
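/*
 * Illustrative aside, not part of the kernel import: a pthread sketch of the
 * two-level scheme implemented by trace_access_lock()/trace_access_unlock()
 * above -- per-cpu readers take the shared lock for reading plus their own
 * cpu's mutex, while a whole-buffer operation takes the shared lock for
 * writing and so excludes everyone. NCPU and the names are arbitrary;
 * build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

#define NCPU 4
#define ALL_CPUS (-1)

static pthread_rwlock_t all_access = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpu_access[NCPU];

static void access_lock(int cpu)
{
        if (cpu == ALL_CPUS) {
                /* exclude every per-cpu reader in one go */
                pthread_rwlock_wrlock(&all_access);
        } else {
                /* first block any ALL_CPUS holder, then serialize this cpu */
                pthread_rwlock_rdlock(&all_access);
                pthread_mutex_lock(&cpu_access[cpu]);
        }
}

static void access_unlock(int cpu)
{
        if (cpu == ALL_CPUS) {
                pthread_rwlock_unlock(&all_access);
        } else {
                pthread_mutex_unlock(&cpu_access[cpu]);
                pthread_rwlock_unlock(&all_access);
        }
}

int main(void)
{
        for (int i = 0; i < NCPU; i++)
                pthread_mutex_init(&cpu_access[i], NULL);

        access_lock(2);          /* consume cpu 2's buffer */
        access_unlock(2);
        access_lock(ALL_CPUS);   /* operate on the whole buffer */
        access_unlock(ALL_CPUS);
        puts("two-level locking exercised");
        return 0;
}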
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + int pc; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + pc = preempt_count(); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + int pc; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + pc = preempt_count(); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (in_nmi()) { + internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + internal_trace_puts("*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); + internal_trace_puts("*** stopping trace here! 
***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); + internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +static void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} + +/** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_off(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. 
+ * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + tracer_tracing_off(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_off); + +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) + tracing_off(); +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +static int tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + return ring_buffer_record_is_on(tr->trace_buffer.buffer); + return !tr->buffer_disabled; +} + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return tracer_tracing_is_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_is_on); + +static int __init set_buf_size(char *str) +{ + unsigned long buf_size; + + if (!str) + return 0; + buf_size = memparse(str, &str); + /* nr_entries can not be zero */ + if (buf_size == 0) + return 0; + trace_buf_size = buf_size; + return 1; +} +__setup("trace_buf_size=", set_buf_size); + +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshold; + int ret; + + if (!str) + return 0; + ret = kstrtoul(str, 0, &threshold); + if (ret < 0) + return 0; + tracing_thresh = threshold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +/* These must match the bit postions in trace_iterator_flags */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + "raw", + "hex", + "bin", + "block", + "stacktrace", + "trace_printk", + "ftrace_preempt", + "branch", + "annotate", + "userstacktrace", + "sym-userobj", + "printk-msg-only", + "context-info", + "latency-format", + "sleep-time", + "graph-time", + "record-cmd", + "overwrite", + "disable_on_free", + "irq-info", + "markers", + "function-trace", + NULL +}; + +static struct { + u64 (*func)(void); + const char *name; + int in_ns; /* is this clock in nanoseconds? */ +} trace_clocks[] = { + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, + ARCH_TRACE_CLOCKS +}; + +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. 
+ * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size - 1) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else if (parser->idx < parser->size - 1) { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + +/* TODO add a seq_buf_to_buffer() */ +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +{ + int len; + + if (trace_seq_used(s) <= s->seq.readpos) + return -EBUSY; + + len = trace_seq_used(s) - s->seq.readpos; + if (cnt > len) + cnt = len; + memcpy(buf, s->buffer + s->seq.readpos, cnt); + + s->seq.readpos += cnt; + return cnt; +} + +unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); + + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; + + max_data->saved_latency = tr->max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + max_data->pid = tsk->pid; + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. 
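/*
 * Illustrative aside, not part of the kernel import: trace_get_user() above
 * returns one whitespace-separated token per call and uses parser->cont to
 * remember that a write ended in the middle of a token, so the next chunk
 * continues it. The stripped-down parser below works on in-memory chunks
 * instead of a user buffer; the struct and names are invented for the sketch.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

struct chunk_parser {
        char buf[32];
        unsigned int idx;
        bool cont;              /* token continues into the next chunk */
};

/* feed one chunk; print a token each time whitespace terminates one */
static void feed(struct chunk_parser *p, const char *chunk)
{
        for (const char *s = chunk; *s; s++) {
                if (isspace((unsigned char)*s)) {
                        if (p->idx) {
                                p->buf[p->idx] = '\0';
                                printf("token: %s\n", p->buf);
                                p->idx = 0;
                        }
                        p->cont = false;
                } else if (p->idx < sizeof(p->buf) - 1) {
                        p->buf[p->idx++] = *s;   /* keep accumulating */
                        p->cont = true;
                }
        }
}

int main(void)
{
        struct chunk_parser p = { .idx = 0, .cont = false };

        feed(&p, "sched_switch sched_wak");   /* this chunk ends mid-token ... */
        feed(&p, "eup\n");                    /* ... and is finished here      */
        return 0;
}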
+ */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct ring_buffer *buf; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + */ + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, + "Failed to swap buffers due to commit in progress\n"); + } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int wait_on_pipe(struct trace_iterator *iter, bool full) +{ + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return 0; + + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; + + if (!type->selftest || tracing_selftest_disabled) + return 0; + + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. 
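/*
 * Illustrative aside, not part of the kernel import: update_max_tr() above
 * preserves a new worst-case trace by swapping the live buffer pointer with
 * the spare max buffer instead of copying data. The sketch below keeps the
 * same shape with a plain mutex standing in for arch_spin_lock(); all names
 * and numbers are made up.
 */
#include <pthread.h>
#include <stdio.h>

struct buf { const char *tag; unsigned long worst_latency_us; };

static struct buf a = { "buf-a", 0 }, b = { "buf-b", 0 };
static struct buf *live = &a, *max = &b;
static unsigned long max_latency;
static pthread_mutex_t max_lock = PTHREAD_MUTEX_INITIALIZER;

static void record_latency(unsigned long lat_us)
{
        if (lat_us <= max_latency)
                return;                 /* not a new record, nothing to keep */

        pthread_mutex_lock(&max_lock);
        max_latency = lat_us;

        /* flip the pointers: the old live buffer becomes the saved maximum */
        struct buf *tmp = live;
        live = max;
        max = tmp;
        max->worst_latency_us = lat_us;
        pthread_mutex_unlock(&max_lock);
}

int main(void)
{
        record_latency(120);
        record_latency(90);     /* ignored: not worse than the record */
        record_latency(450);
        printf("max snapshot is %s with %lu us\n", max->tag, max->worst_latency_us);
        return 0;
}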
+ */ + tracing_reset_online_cpus(&tr->trace_buffer); + + tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; +} +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + if (strlen(type->name) >= MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + + mutex_lock(&trace_types_lock); + + tracing_selftest_running = true; + + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Tracer %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + + ret = run_tracer_selftest(type); + if (ret < 0) + goto out; + + type->next = trace_types; + trace_types = type; + + out: + tracing_selftest_running = false; + mutex_unlock(&trace_types_lock); + + if (ret || !default_bootup_tracer) + goto out_unlock; + + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) + goto out_unlock; + + printk(KERN_INFO "Starting tracer '%s'\n", type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(&global_trace, type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. 
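/*
 * Illustrative aside, not part of the kernel import: register_tracer() above
 * keeps tracers on a singly linked list keyed by name, rejects duplicates,
 * and substitutes dummy callbacks so later code never has to check for NULL.
 * The registry below repeats that shape in plain C with invented types.
 */
#include <stdio.h>
#include <string.h>

struct demo_tracer {
        const char *name;
        int (*set_flag)(unsigned int bit, int set);
        struct demo_tracer *next;
};

static int dummy_set_flag(unsigned int bit, int set)
{
        (void)bit; (void)set;
        return 0;
}

static struct demo_tracer *tracers;   /* head of the registered list */

static int demo_register_tracer(struct demo_tracer *t)
{
        if (!t->name) {
                puts("Tracer must have a name");
                return -1;
        }
        for (struct demo_tracer *p = tracers; p; p = p->next) {
                if (!strcmp(p->name, t->name)) {
                        printf("Tracer %s already registered\n", t->name);
                        return -1;
                }
        }
        if (!t->set_flag)
                t->set_flag = dummy_set_flag;   /* never leave callbacks NULL */
        t->next = tracers;
        tracers = t;
        return 0;
}

int main(void)
{
        static struct demo_tracer nop = { .name = "nop" };
        static struct demo_tracer fn  = { .name = "function" };
        static struct demo_tracer dup = { .name = "nop" };

        demo_register_tracer(&nop);
        demo_register_tracer(&fn);
        demo_register_tracer(&dup);     /* rejected as a duplicate */
        return 0;
}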
*/ + tracing_selftest_disabled = true; +#ifdef CONFIG_FTRACE_STARTUP_TEST + printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", + type->name); +#endif + + out_unlock: + return ret; +} + +void tracing_reset(struct trace_buffer *buf, int cpu) +{ + struct ring_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + ring_buffer_reset_cpu(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + +void tracing_reset_online_cpus(struct trace_buffer *buf) +{ + struct ring_buffer *buffer = buf->buffer; + int cpu; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + for_each_online_cpu(cpu) + ring_buffer_reset_cpu(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + +/* Must have trace_types_lock held */ +void tracing_reset_all_online_cpus(void) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif + } +} + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; + +/* temporary disable recording */ +static atomic_t trace_record_cmdline_disabled __read_mostly; + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; +} + +int is_tracing_stopped(void) +{ + return global_trace.stop_count; +} + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. 
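/*
 * Illustrative aside, not part of the kernel import: allocate_cmdlines_buffer()
 * above allocates two arrays and, if the second allocation fails, frees the
 * first before returning -ENOMEM so the caller never sees a half-built
 * structure. The same rollback shape in userspace C; sizes and names here
 * are arbitrary.
 */
#include <stdio.h>
#include <stdlib.h>

struct cmdline_cache {
        unsigned int *slot_to_pid;
        char *comms;
        unsigned int num;
};

static int cache_init(struct cmdline_cache *c, unsigned int num, unsigned int comm_len)
{
        c->slot_to_pid = malloc(num * sizeof(*c->slot_to_pid));
        if (!c->slot_to_pid)
                return -1;

        c->comms = malloc((size_t)num * comm_len);
        if (!c->comms) {
                free(c->slot_to_pid);    /* undo the first allocation */
                c->slot_to_pid = NULL;
                return -1;
        }
        c->num = num;
        return 0;
}

int main(void)
{
        struct cmdline_cache c = { 0 };

        if (cache_init(&c, 128, 16) == 0) {
                puts("cmdline cache ready");
                free(c.comms);
                free(c.slot_to_pid);
        }
        return 0;
}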
+ */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (--global_trace.stop_count) { + if (global_trace.stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + global_trace.stop_count = 0; + } + goto out; + } + + /* Prevent the buffers from switching */ + arch_spin_lock(&global_trace.max_lock); + + buffer = global_trace.trace_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); +#endif + + arch_spin_unlock(&global_trace.max_lock); + + out: + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + /* If global, we need to also start the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_start(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + + if (--tr->stop_count) { + if (tr->stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + tr->stop_count = 0; + } + goto out; + } + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (global_trace.stop_count++) + goto out; + + /* Prevent the buffers from switching */ + arch_spin_lock(&global_trace.max_lock); + + buffer = global_trace.trace_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); +#endif + + arch_spin_unlock(&global_trace.max_lock); + + out: + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + /* If global, we need to also stop the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_stop(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); +} + +void trace_stop_cmdline_recording(void); + +static int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned pid, idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return 0; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + /* + * Check whether the cmdline buffer at idx has a pid + * mapped. We are going to overwrite that entry so we + * need to clear the map_pid_to_cmdline. Otherwise we + * would read the new comm for the old pid. 
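/*
 * Illustrative aside, not part of the kernel import: trace_save_cmdline()
 * (the comment above, code continued just below) keeps a small ring of saved
 * comms plus two index arrays, and clears the stale pid->slot mapping before
 * reusing a slot so an old pid can never resolve to the new task's comm.
 * The sketch below shrinks the tables and invents the names.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PID   64
#define NR_SLOTS   4
#define NO_MAP   0xffffffffu
#define COMM_LEN 16

static unsigned int pid_to_slot[MAX_PID + 1];
static unsigned int slot_to_pid[NR_SLOTS];
static char saved_comm[NR_SLOTS][COMM_LEN];
static unsigned int next_slot;

static void cache_init(void)
{
        memset(pid_to_slot, 0xff, sizeof(pid_to_slot));   /* everything unmapped */
        memset(slot_to_pid, 0xff, sizeof(slot_to_pid));
}

static void save_comm(unsigned int pid, const char *comm)
{
        unsigned int slot = pid_to_slot[pid];

        if (slot == NO_MAP) {
                slot = next_slot++ % NR_SLOTS;

                /* evict: unmap whichever pid owned this slot before */
                if (slot_to_pid[slot] != NO_MAP)
                        pid_to_slot[slot_to_pid[slot]] = NO_MAP;

                slot_to_pid[slot] = pid;
                pid_to_slot[pid] = slot;
        }
        strncpy(saved_comm[slot], comm, COMM_LEN - 1);
}

static const char *find_comm(unsigned int pid)
{
        unsigned int slot = pid_to_slot[pid];

        return slot == NO_MAP ? "<...>" : saved_comm[slot];
}

int main(void)
{
        cache_init();
        save_comm(1, "init");
        save_comm(42, "bash");
        printf("pid 42 -> %s\n", find_comm(42));
        printf("pid 7  -> %s\n", find_comm(7));   /* never saved */
        return 0;
}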
+ */ + pid = savedcmd->map_cmdline_to_pid[idx]; + if (pid != NO_CMDLINE_MAP) + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; + + savedcmd->cmdline_idx = idx; + } + + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + + if (!pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + if (pid > PID_MAX_DEFAULT) { + strcpy(comm, "<...>"); + return; + } + + map = savedcmd->map_pid_to_cmdline[pid]; + if (map != NO_CMDLINE_MAP) + strcpy(comm, get_saved_cmdlines(map)); + else + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + return; + + if (!__this_cpu_read(trace_cmdline_save)) + return; + + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); +} + +void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + int pc) +{ + struct task_struct *tsk = current; + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->flags = +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +#else + TRACE_FLAG_IRQS_NOSUPPORT | +#endif + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (test_preempt_need_resched() ? 
TRACE_FLAG_PREEMPT_RESCHED : 0); +} +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) { + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; + } + + return event; +} + +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + ring_buffer_unlock_commit(buffer, event); +} + +static inline void +__trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __buffer_unlock_commit(buffer, event); + + ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_userstack(buffer, flags, pc); +} + +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(buffer, event, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); + +static struct ring_buffer *temp_buffer; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *entry; + + *current_rb = ftrace_file->tr->trace_buffer.buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the tigger to use. It's recusive + * safe and will not be recorded anywhere. 
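/*
 * Illustrative aside, not part of the kernel import: the helpers above all
 * follow the same discipline -- reserve space in the ring buffer, fill the
 * event payload in place, then commit it (or discard it if a filter rejects
 * it). The toy, single-threaded ring below shows only that reserve/fill/
 * commit flow; it is not a model of the real lock-free ring buffer.
 */
#include <stdio.h>
#include <string.h>

#define RING_SIZE 64

struct toy_ring {
        char data[RING_SIZE];
        unsigned int reserved;    /* bytes handed out but not yet committed */
        unsigned int committed;   /* bytes visible to readers */
};

/* hand back a slot inside the buffer, or NULL if the event would not fit */
static void *ring_reserve(struct toy_ring *r, unsigned int len)
{
        if (r->reserved + len > RING_SIZE)
                return NULL;
        void *p = &r->data[r->reserved];
        r->reserved += len;
        return p;
}

static void ring_commit(struct toy_ring *r)
{
        r->committed = r->reserved;   /* make the filled event readable */
}

int main(void)
{
        struct toy_ring ring = { .reserved = 0, .committed = 0 };
        const char msg[] = "hello tracer";

        char *slot = ring_reserve(&ring, sizeof(msg));
        if (!slot)
                return 1;               /* a failed reserve is simply a drop */
        memcpy(slot, msg, sizeof(msg)); /* fill the event in place */
        ring_commit(&ring);

        printf("committed %u bytes: %s\n", ring.committed, ring.data);
        return 0;
}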
+ */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = global_trace.trace_buffer.buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); + +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(buffer, event, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); + +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) +{ + __buffer_unlock_commit(buffer, event); + + ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_userstack(buffer, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); + +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(buffer, event); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); + +void +trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_function; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; + + /* If we are reading the ring buffer, don't trace */ + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); +} + +#ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { + unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + +static void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + struct ftrace_event_call *call = &event_kernel_stack; + struct ring_buffer_event *event; + struct stack_entry *entry; + struct stack_trace trace; + int use_stack; + int size = FTRACE_STACK_ENTRIES; + + trace.nr_entries = 0; + trace.skip = skip; + + /* + * Since events can happen in NMIs there's no safe way to + * use the per cpu ftrace_stacks. We reserve it and if an interrupt + * or NMI comes in, it will just have to use the default + * FTRACE_STACK_SIZE. + */ + preempt_disable_notrace(); + + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); + /* + * We don't need any atomic variables, just a barrier. + * If an interrupt comes in, we don't care, because it would + * have exited and put the counter back to what we want. + * We just need a barrier to keep gcc from moving things + * around. 
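/*
 * Illustrative aside, not part of the kernel import: __ftrace_trace_stack()
 * above guards its shared per-cpu scratch with a plain reservation counter --
 * the outermost user on a CPU gets the large static buffer, nested contexts
 * (the logic continues just below) fall back to the smaller inline path. The
 * sketch uses a thread as a stand-in for a CPU; sizes and names are made up.
 */
#include <stdio.h>

#define BIG_SCRATCH  1024
#define SMALL_INLINE   64

static _Thread_local int reserve_depth;

static void capture_stack(int simulate_nested)
{
        int depth = ++reserve_depth;   /* reserve before touching the scratch */

        if (depth == 1)
                printf("depth 1: outermost user, %d-byte scratch\n", BIG_SCRATCH);
        else
                printf("depth %d: nested context, %d-byte inline fallback\n",
                       depth, SMALL_INLINE);

        /* pretend an interrupt fires while the outer capture is in progress */
        if (simulate_nested)
                capture_stack(0);

        --reserve_depth;               /* release the reservation */
}

int main(void)
{
        capture_stack(1);
        return 0;
}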
+ */ + barrier(); + if (use_stack == 1) { + trace.entries = this_cpu_ptr(ftrace_stack.calls); + trace.max_entries = FTRACE_STACK_MAX_ENTRIES; + + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; + } else + /* From now on, use_stack is a boolean */ + use_stack = 0; + + size *= sizeof(unsigned long); + + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, + sizeof(*entry) + size, flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + + memset(&entry->caller, 0, size); + + if (use_stack) + memcpy(&entry->caller, trace.entries, + trace.nr_entries * sizeof(unsigned long)); + else { + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.entries = entry->caller; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + } + + entry->size = trace.nr_entries; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __this_cpu_dec(ftrace_stack_reserve); + preempt_enable_notrace(); + +} + +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, regs); +} + +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); +} + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) +{ + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); +} + +/** + * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) + */ +void trace_dump_stack(int skip) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return; + + local_save_flags(flags); + + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); +} + +static DEFINE_PER_CPU(int, user_stack_count); + +void +ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. 
+ */
+ preempt_disable();
+ if (__this_cpu_read(user_stack_count))
+ goto out;
+
+ __this_cpu_inc(user_stack_count);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ goto out_drop_count;
+ entry = ring_buffer_event_data(event);
+
+ entry->tgid = current->tgid;
+ memset(&entry->caller, 0, sizeof(entry->caller));
+
+ trace.nr_entries = 0;
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.skip = 0;
+ trace.entries = entry->caller;
+
+ save_stack_trace_user(&trace);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+
+ out_drop_count:
+ __this_cpu_dec(user_stack_count);
+ out:
+ preempt_enable();
+}
+
+#ifdef UNUSED
+static void __trace_userstack(struct trace_array *tr, unsigned long flags)
+{
+ ftrace_trace_userstack(tr, flags, preempt_count());
+}
+#endif /* UNUSED */
+
+#endif /* CONFIG_STACKTRACE */
+
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ char buffer[TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct *trace_percpu_sirq_buffer;
+static struct trace_buffer_struct *trace_percpu_irq_buffer;
+static struct trace_buffer_struct *trace_percpu_nmi_buffer;
+
+/*
+ * The buffer used is dependent on the context. There is a per cpu
+ * buffer for normal context, softirq context, hard irq context and
+ * for NMI context. This allows for lockless recording.
+ *
+ * Note, if the buffers failed to be allocated, then this returns NULL
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *percpu_buffer;
+
+ /*
+ * If we have allocated per cpu buffers, then we do not
+ * need to do any locking.
+ */
+ if (in_nmi())
+ percpu_buffer = trace_percpu_nmi_buffer;
+ else if (in_irq())
+ percpu_buffer = trace_percpu_irq_buffer;
+ else if (in_softirq())
+ percpu_buffer = trace_percpu_sirq_buffer;
+ else
+ percpu_buffer = trace_percpu_buffer;
+
+ if (!percpu_buffer)
+ return NULL;
+
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
+}
+
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct *sirq_buffers;
+ struct trace_buffer_struct *irq_buffers;
+ struct trace_buffer_struct *nmi_buffers;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!buffers)
+ goto err_warn;
+
+ sirq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!sirq_buffers)
+ goto err_sirq;
+
+ irq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!irq_buffers)
+ goto err_irq;
+
+ nmi_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!nmi_buffers)
+ goto err_nmi;
+
+ trace_percpu_buffer = buffers;
+ trace_percpu_sirq_buffer = sirq_buffers;
+ trace_percpu_irq_buffer = irq_buffers;
+ trace_percpu_nmi_buffer = nmi_buffers;
+
+ return 0;
+
+ err_nmi:
+ free_percpu(irq_buffers);
+ err_irq:
+ free_percpu(sirq_buffers);
+ err_sirq:
+ free_percpu(buffers);
+ err_warn:
+ WARN(1, "Could not allocate percpu trace_printk buffer");
+ return -ENOMEM;
+}
+
+static int buffers_allocated;
+
+void trace_printk_init_buffers(void)
+{
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warning("\n");
+ pr_warning("**********************************************************\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("** **\n");
+ pr_warning("** trace_printk() being used. Allocating extra memory. 
**\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for production use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! **\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); + + /* Expand the buffers to set size */ + tracing_update_buffers(); + + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. If the global_trace.buffer is already + * allocated here, then this was called by module code. + */ + if (global_trace.trace_buffer.buffer) + tracing_start_cmdline_record(); +} + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); +} + +/** + * trace_vbprintk - write binary msg to tracing buffer + * + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + struct ftrace_event_call *call = &event_bprint; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct trace_array *tr = &global_trace; + struct bprint_entry *entry; + unsigned long flags; + char *tbuffer; + int len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + preempt_disable_notrace(); + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out; + } + + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); + + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out; + + local_save_flags(flags); + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } + +out: + preempt_enable_notrace(); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) +{ + struct ftrace_event_call *call = &event_print; + struct ring_buffer_event *event; + int len = 0, size, pc; + struct print_entry *entry; + unsigned long flags; + char *tbuffer; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + preempt_disable_notrace(); + + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out; + } + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + + local_save_flags(flags); + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + flags, pc); + if (!event) + goto out; + 
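+ /*
+ * "size" above reserves room for the terminating NUL as well,
+ * which is why len + 1 bytes are copied into entry->buf below.
+ */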
entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, tbuffer, len + 1); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } + out: + preempt_enable_notrace(); + unpause_graph_tracing(); + + return len; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(&global_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + + iter->idx++; + if (buf_iter) + ring_buffer_read(buf_iter, NULL); +} + +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); + + if (buf_iter) + event = ring_buffer_iter_peek(buf_iter, ts); + else + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, + lost_events); + + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; +} + +static struct trace_entry * +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) +{ + struct ring_buffer *buffer = iter->trace_buffer->buffer; + struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; + int cpu_file = iter->cpu_file; + u64 next_ts = 0, ts; + int next_cpu = -1; + int next_size = 0; + int cpu; + + /* + * If we are in a per_cpu trace file, don't bother by iterating over + * all cpu and peek directly. 
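+ *
+ * (cpu_file is RING_BUFFER_ALL_CPUS, i.e. -1, when the whole buffer is
+ * being read, and holds the CPU number when a per_cpu/cpuN/trace file
+ * was opened, which is why a simple "greater than" test is enough
+ * below.)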
+ */ + if (cpu_file > RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(buffer, cpu_file)) + return NULL; + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); + if (ent_cpu) + *ent_cpu = cpu_file; + + return ent; + } + + for_each_tracing_cpu(cpu) { + + if (ring_buffer_empty_cpu(buffer, cpu)) + continue; + + ent = peek_next_entry(iter, cpu, &ts, &lost_events); + + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ts < next_ts)) { + next = ent; + next_cpu = cpu; + next_ts = ts; + next_lost = lost_events; + next_size = iter->ent_size; + } + } + + iter->ent_size = next_size; + + if (ent_cpu) + *ent_cpu = next_cpu; + + if (ent_ts) + *ent_ts = next_ts; + + if (missing_events) + *missing_events = next_lost; + + return next; +} + +/* Find the next real entry, without updating the iterator itself */ +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) +{ + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); +} + +/* Find the next real entry, and increment the iterator to the next entry */ +void *trace_find_next_entry_inc(struct trace_iterator *iter) +{ + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); + + if (iter->ent) + trace_iterator_increment(iter); + + return iter->ent ? iter : NULL; +} + +static void trace_consume(struct trace_iterator *iter) +{ + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, + &iter->lost_events); +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + int i = (int)*pos; + void *ent; + + WARN_ON_ONCE(iter->leftover); + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = trace_find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = trace_find_next_entry_inc(iter); + + iter->pos = *pos; + + return ent; +} + +void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter; + unsigned long entries = 0; + u64 ts; + + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; + + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) + return; + + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + if (ts >= iter->trace_buffer->time_start) + break; + entries++; + ring_buffer_read(buf_iter, NULL); + } + + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; +} + +/* + * The current tracer is copied to avoid a global locking + * all around. + */ +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + int cpu_file = iter->cpu_file; + void *p = NULL; + loff_t l = 0; + int cpu; + + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. 
+ */ + mutex_lock(&trace_types_lock); + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; + mutex_unlock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); +#endif + + if (!iter->snapshot) + atomic_inc(&trace_record_cmdline_disabled); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + if (cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) + tracing_iter_reset(iter, cpu); + } else + tracing_iter_reset(iter, cpu_file); + + iter->leftover = 0; + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } + } + + trace_event_read_lock(); + trace_access_lock(cpu_file); + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return; +#endif + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); + + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); +} + +static void +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) +{ + unsigned long count; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total += count; + } else + *total += count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries += count; + } +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); +} + +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) +{ + unsigned long total; + unsigned long entries; + + get_total_entries(buf, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" + "# | | | | |\n"); +} + +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# _-----=> irqs-off\n" + "# / _----=> need-resched\n" + "# | / _---=> hardirq/softirq\n" + "# || / _--=> preempt-depth\n" + "# ||| / delay\n" + "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" + "# | | | |||| | |\n"); +} + +void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); + struct tracer *type = iter->trace; + unsigned long entries; + unsigned long total; + const char 
*name = "preemption"; + + name = type->name; + + get_total_entries(buf, &total, &entries); + + seq_printf(m, "# %s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "# -----------------------------------" + "---------------------------------\n"); + seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + buf->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, "# -----------------\n"); + seq_printf(m, "# | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, + data->policy, data->rt_priority); + seq_puts(m, "# -----------------\n"); + + if (data->critical_start) { + seq_puts(m, "# => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n# => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n#\n"); + } + + seq_puts(m, "#\n"); +} + +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + + if (cpumask_test_cpu(iter->cpu, iter->started)) + return; + + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) + return; + + cpumask_set_cpu(iter->cpu, iter->started); + + /* Don't print started cpu buffer for the first entry of the trace */ + if (iter->idx > 1) + trace_seq_printf(s, "##### CPU %u buffer started ####\n", + iter->cpu); +} + +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + test_cpu_buff_start(iter); + + event = ftrace_find_event(entry->type); + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + trace_print_context(iter); + } + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (event) + return event->funcs->trace(iter, sym_flags, event); + + trace_seq_printf(s, "Unknown type %d\n", entry->type); + + return trace_handle_return(s); +} + +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + event = ftrace_find_event(entry->type); + if (event) + return event->funcs->raw(iter, 0, event); + + trace_seq_printf(s, "%d ?\n", entry->type); + + return trace_handle_return(s); +} + +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & 
TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + } + + event = ftrace_find_event(entry->type); + if (event) { + enum print_line_t ret = event->funcs->hex(iter, 0, event); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } + + SEQ_PUT_FIELD(s, newline); + + return trace_handle_return(s); +} + +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + } + + event = ftrace_find_event(entry->type); + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; +} + +int trace_empty(struct trace_iterator *iter) +{ + struct ring_buffer_iter *buf_iter; + int cpu; + + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + cpu = iter->cpu_file; + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + return 0; + } + return 1; + } + + for_each_tracing_cpu(cpu) { + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + return 0; + } + } + + return 1; +} + +/* Called with trace_event_read_lock() held. */ +enum print_line_t print_trace_line(struct trace_iterator *iter) +{ + enum print_line_t ret; + + if (iter->lost_events) { + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } + + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bprintk_msg_only(iter); + + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_printk_msg_only(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + return print_trace_fmt(iter); +} + +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + 
print_trace_header(m, iter);
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+ } else {
+ if (!(trace_flags & TRACE_ITER_VERBOSE)) {
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ print_func_help_header_irq(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
+ }
+ }
+}
+
+static void test_ftrace_alive(struct seq_file *m)
+{
+ if (!ftrace_is_dead())
+ return;
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS DISABLED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
+}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_puts(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+#else
+/* Should never be called */
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
+#endif
+
+static int s_show(struct seq_file *m, void *v)
+{
+ struct trace_iterator *iter = v;
+ int ret;
+
+ if (iter->ent == NULL) {
+ if (iter->tr) {
+ seq_printf(m, "# tracer: %s\n", iter->trace->name);
+ seq_puts(m, "#\n");
+ test_ftrace_alive(m);
+ }
+ if (iter->snapshot && trace_empty(iter))
+ print_snapshot_help(m, iter);
+ else if (iter->trace && iter->trace->print_header)
+ iter->trace->print_header(m);
+ else
+ trace_default_header(m);
+
+ } else if (iter->leftover) {
+ /*
+ * If we filled the seq_file buffer earlier, we
+ * want to just show it now.
+ */
+ ret = trace_print_seq(m, &iter->seq);
+
+ /* ret should this time be zero, but you never know */
+ iter->leftover = ret;
+
+ } else {
+ print_trace_line(iter);
+ ret = trace_print_seq(m, &iter->seq);
+ /*
+ * If we overflow the seq_file buffer, then it will
+ * ask us for this data again at start up.
+ * Use that instead.
+ * ret is 0 if seq_file write succeeded.
+ * -1 otherwise.
+ */
+ iter->leftover = ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
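+ *
+ * The per-cpu trace files store "cpu + 1" in i_cdev when they are
+ * created, so a NULL i_cdev means "no particular CPU" and is mapped to
+ * RING_BUFFER_ALL_CPUS, while anything else decodes back to the CPU
+ * number by subtracting one.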
+ */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + +static const struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator * +__tracing_open(struct inode *inode, struct file *file, bool snapshot) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int cpu; + + if (tracing_disabled) + return ERR_PTR(-ENODEV); + + iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. + */ + mutex_lock(&trace_types_lock); + iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) + goto fail; + + *iter->trace = *tr->current_trace; + + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) + goto fail; + + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) + iter->trace_buffer = &tr->max_buffer; + else +#endif + iter->trace_buffer = &tr->trace_buffer; + iter->snapshot = snapshot; + iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); + mutex_init(&iter->mutex); + + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop_tr(tr); + + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + } else { + cpu = iter->cpu_file; + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + + mutex_unlock(&trace_types_lock); + + return iter; + + fail: + mutex_unlock(&trace_types_lock); + kfree(iter->trace); + kfree(iter->buffer_iter); +release: + seq_release_private(inode, file); + return ERR_PTR(-ENOMEM); +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + filp->private_data = inode->i_private; + return 0; +} + +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. 
+ */ +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; +} + +static int tracing_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m = file->private_data; + struct trace_iterator *iter; + int cpu; + + if (!(file->f_mode & FMODE_READ)) { + trace_array_put(tr); + return 0; + } + + /* Writes do not use seq_file */ + iter = m->private; + mutex_lock(&trace_types_lock); + + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start_tr(tr); + + __trace_array_put(tr); + + mutex_unlock(&trace_types_lock); + + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); + kfree(iter->trace); + kfree(iter->buffer_iter); + seq_release_private(inode, file); + + return 0; +} + +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int ret = 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); + + if (cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->trace_buffer); + else + tracing_reset(&tr->trace_buffer, cpu); + } + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, false); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } + + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. 
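+ *
+ * (A tracer opts in to instances by setting its ->allow_instances
+ * flag, which is what the check below looks at.)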
+ */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct tracer *t = v; + + (*pos)++; + + if (t) + t = get_tracer_for_array(tr, t->next); + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct tracer *t; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_puts(m, t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + if (tracing_disabled) + return -ENODEV; + + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; +} + +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) +{ + int ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 0; + + return ret; +} + +static const struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .write = tracing_write_stub, + .llseek = tracing_lseek, + .release = tracing_release, +}; + +static const struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (and one more byte for the newline): + */ +static char mask_str[NR_CPUS + 1]; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + int len; + + mutex_lock(&tracing_cpumask_update_lock); + + len = snprintf(mask_str, count, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_err; + } + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + +out_err: + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + cpumask_var_t tracing_cpumask_new; + int err, cpu; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = 
cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_unlock; + + mutex_lock(&tracing_cpumask_update_lock); + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + for_each_tracing_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: + */ + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); + } + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); + } + } + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); + + mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); + + return count; + +err_unlock: + free_cpumask_var(tracing_cpumask_new); + + return err; +} + +static const struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic_tr, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, + .llseek = generic_file_llseek, +}; + +static int tracing_trace_options_show(struct seq_file *m, void *v) +{ + struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; + u32 tracer_flags; + int i; + + mutex_lock(&trace_types_lock); + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + seq_printf(m, "%s\n", trace_options[i]); + else + seq_printf(m, "no%s\n", trace_options[i]); + } + + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + seq_printf(m, "%s\n", trace_opts[i].name); + else + seq_printf(m, "no%s\n", trace_opts[i].name); + } + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int __set_tracer_option(struct trace_array *tr, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + struct tracer *trace = tr->current_trace; + int ret; + + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; +} + +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) +{ + struct tracer *trace = tr->current_trace; + struct tracer_flags *tracer_flags = trace->flags; + struct tracer_opt *opts = NULL; + int i; + + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; + + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(tr, trace->flags, opts, neg); + } + + return -EINVAL; +} + +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +{ + /* do nothing if flag is already set */ + if (!!(trace_flags & mask) == !!enabled) + return 0; + + /* Give the tracer a chance to approve the change */ + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) + return -EINVAL; + + if (enabled) + trace_flags |= mask; + else + trace_flags &= ~mask; + + 
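+ /*
+ * A few flags need more than the bit flip: they also have to be
+ * propagated to the ring buffer or to the cmdline/comm recording.
+ */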
if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) { + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif + } + + if (mask == TRACE_ITER_PRINTK) + trace_printk_start_stop_comm(enabled); + + return 0; +} + +static int trace_set_options(struct trace_array *tr, char *option) +{ + char *cmp; + int neg = 0; + int ret = -ENODEV; + int i; + + cmp = strstrip(option); + + if (strncmp(cmp, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + mutex_lock(&trace_types_lock); + + for (i = 0; trace_options[i]; i++) { + if (strcmp(cmp, trace_options[i]) == 0) { + ret = set_tracer_flag(tr, 1 << i, !neg); + break; + } + } + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) + ret = set_tracer_option(tr, cmp, neg); + + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = trace_set_options(tr, buf); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static const struct file_operations tracing_iter_fops = { + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, + .write = tracing_trace_options_write, +}; + +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t-change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid 
list\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t\t dump\n" + "\t\t cpudump\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. 
Read the contents for more\n" + "\t\t\t information\n" +#endif +#ifdef CONFIG_STACK_TRACER + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" +#endif +#endif /* CONFIG_STACK_TRACER */ + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, + .llseek = generic_file_llseek, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + 
unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); + return -ENOMEM; + } + + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; + +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +static union trace_enum_map_item * +update_enum_map(union trace_enum_map_item *ptr) +{ + if (!ptr->map.enum_string) { + if (ptr->tail.next) { + ptr = ptr->tail.next; + /* Set ptr to the next real item (skip head) */ + ptr++; + } else + return NULL; + } + return ptr; +} + +static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +{ + union trace_enum_map_item *ptr = v; + + /* + * Paranoid! If ptr points to end, we don't want to increment past it. + * This really should never happen. 
+ */ + ptr = update_enum_map(ptr); + if (WARN_ON_ONCE(!ptr)) + return NULL; + + ptr++; + + (*pos)++; + + ptr = update_enum_map(ptr); + + return ptr; +} + +static void *enum_map_start(struct seq_file *m, loff_t *pos) +{ + union trace_enum_map_item *v; + loff_t l = 0; + + mutex_lock(&trace_enum_mutex); + + v = trace_enum_maps; + if (v) + v++; + + while (v && l < *pos) { + v = enum_map_next(m, v, &l); + } + + return v; +} + +static void enum_map_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&trace_enum_mutex); +} + +static int enum_map_show(struct seq_file *m, void *v) +{ + union trace_enum_map_item *ptr = v; + + seq_printf(m, "%s %ld (%s)\n", + ptr->map.enum_string, ptr->map.enum_value, + ptr->map.system); + + return 0; +} + +static const struct seq_operations tracing_enum_map_seq_ops = { + .start = enum_map_start, + .next = enum_map_next, + .stop = enum_map_stop, + .show = enum_map_show, +}; + +static int tracing_enum_map_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_enum_map_seq_ops); +} + +static const struct file_operations tracing_enum_map_fops = { + .open = tracing_enum_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static inline union trace_enum_map_item * +trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +{ + /* Return tail of array given the head */ + return ptr + ptr->head.length + 1; +} + +static void +trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, + int len) +{ + struct trace_enum_map **stop; + struct trace_enum_map **map; + union trace_enum_map_item *map_array; + union trace_enum_map_item *ptr; + + stop = start + len; + + /* + * The trace_enum_maps contains the map plus a head and tail item, + * where the head holds the module and length of array, and the + * tail holds a pointer to the next list. 
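+ *
+ * Roughly:
+ *
+ *   [ head: mod, length ][ map 0 ] ... [ map len - 1 ][ tail: next ]
+ *
+ * which is why len + 2 items are allocated below, and why walking the
+ * chain skips "length + 1" entries to hop from a head to its tail.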
+ */ + map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); + if (!map_array) { + pr_warning("Unable to allocate trace enum mapping\n"); + return; + } + + mutex_lock(&trace_enum_mutex); + + if (!trace_enum_maps) + trace_enum_maps = map_array; + else { + ptr = trace_enum_maps; + for (;;) { + ptr = trace_enum_jmp_to_tail(ptr); + if (!ptr->tail.next) + break; + ptr = ptr->tail.next; + + } + ptr->tail.next = map_array; + } + map_array->head.mod = mod; + map_array->head.length = len; + map_array++; + + for (map = start; (unsigned long)map < (unsigned long)stop; map++) { + map_array->map = **map; + map_array++; + } + memset(map_array, 0, sizeof(*map_array)); + + mutex_unlock(&trace_enum_mutex); +} + +static void trace_create_enum_file(struct dentry *d_tracer) +{ + trace_create_file("enum_map", 0444, d_tracer, + NULL, &tracing_enum_map_fops); +} + +#else /* CONFIG_TRACE_ENUM_MAP_FILE */ +static inline void trace_create_enum_file(struct dentry *d_tracer) { } +static inline void trace_insert_enum_map_file(struct module *mod, + struct trace_enum_map **start, int len) { } +#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ + +static void trace_insert_enum_map(struct module *mod, + struct trace_enum_map **start, int len) +{ + struct trace_enum_map **map; + + if (len <= 0) + return; + + map = start; + + trace_event_enum_update(map, len); + + trace_insert_enum_map_file(mod, start, len); +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[MAX_TRACER_SIZE+2]; + int r; + + mutex_lock(&trace_types_lock); + r = sprintf(buf, "%s\n", tr->current_trace->name); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +int tracer_init(struct tracer *t, struct trace_array *tr) +{ + tracing_reset_online_cpus(&tr->trace_buffer); + return t->init(tr); +} + +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) +{ + int cpu; + + for_each_tracing_cpu(cpu) + per_cpu_ptr(buf->data, cpu)->entries = val; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * we use the size that was given, and we can forget about + * expanding it later. 
+ */ + ring_buffer_expanded = true; + + /* May be called before buffers are initialized */ + if (!tr->trace_buffer.buffer) + return 0; + + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); + if (ret < 0) + return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || + !tr->current_trace->use_max_tr) + goto out; + + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); + if (ret < 0) { + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); + if (r < 0) { + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + WARN_ON(1); + tracing_disabled = 1; + } + return ret; + } + + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&tr->max_buffer, size); + else + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + + out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&tr->trace_buffer, size); + else + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; + + return ret; +} + +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) +{ + int ret = size; + + mutex_lock(&trace_types_lock); + + if (cpu_id != RING_BUFFER_ALL_CPUS) { + /* make sure, this cpu is enabled in the mask */ + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { + ret = -EINVAL; + goto out; + } + } + + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); + if (ret < 0) + ret = -ENOMEM; + +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + + +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. + */ +int tracing_update_buffers(void) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + RING_BUFFER_ALL_CPUS); + mutex_unlock(&trace_types_lock); + + return ret; +} + +struct trace_option_dentry; + +static struct trace_option_dentry * +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); + +static void +destroy_trace_option_files(struct trace_option_dentry *topts); + +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + +static void update_tracer_options(struct trace_array *tr, struct tracer *t) +{ + static struct trace_option_dentry *topts; + + /* Only enable if the directory has been created already. 
*/ + if (!tr->dir) + return; + + /* Currently, only the top instance has options */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return; + + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ + struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE + bool had_max_tr; +#endif + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (!ring_buffer_expanded) { + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; + ret = 0; + } + + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == tr->current_trace) + goto out; + + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->current_trace->ref) { + ret = -EBUSY; + goto out; + } + + trace_branch_disable(); + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + /* Current trace needs to be nop_trace before synchronize_sched */ + tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->allocated_snapshot; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); + free_snapshot(tr); + } +#endif + update_tracer_options(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr && !had_max_tr) { + ret = alloc_snapshot(tr); + if (ret < 0) + goto out; + } +#endif + + if (t->init) { + ret = tracer_init(t, tr); + if (ret) + goto out; + } + + tr->current_trace = t; + tr->current_trace->enabled++; + trace_branch_enable(tr); + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[MAX_TRACER_SIZE+1]; + int i; + size_t ret; + int err; + + ret = cnt; + + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + err = tracing_set_tracer(tr, buf); + if (err) + return err; + + *ppos += ret; + + return ret; +} + +static ssize_t +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + *ptr = val * 1000; + + return cnt; +} + +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + goto out; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + goto out; + } + + ret = cnt; +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); +} + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int ret = 0; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + mutex_lock(&trace_types_lock); + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + ret = -ENOMEM; + __trace_array_put(tr); + goto out; + } + + trace_seq_init(&iter->seq); + iter->trace = tr->current_trace; + + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + ret = -ENOMEM; + goto fail; + } + + /* trace pipe does not show start of buffer */ + cpumask_setall(iter->started); + + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + iter->tr = tr; + iter->trace_buffer = &tr->trace_buffer; + iter->cpu_file = tracing_get_cpu(inode); + mutex_init(&iter->mutex); + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + + nonseekable_open(inode, filp); + + tr->current_trace->ref++; +out: + mutex_unlock(&trace_types_lock); + return ret; + +fail: + kfree(iter->trace); + kfree(iter); + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); + return ret; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + struct trace_array *tr = inode->i_private; + + mutex_lock(&trace_types_lock); + + tr->current_trace->ref--; + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + + mutex_unlock(&trace_types_lock); + + free_cpumask_var(iter->started); + mutex_destroy(&iter->mutex); + kfree(iter); + + trace_array_put(tr); + + return 0; +} + +static unsigned int +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) +{ + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return POLLIN | POLLRDNORM; + + if (trace_flags & TRACE_ITER_BLOCK) + /* + * Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, + filp, poll_table); +} + +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + return trace_poll(iter, filp, poll_table); +} + +/* Must be called with iter->mutex held. */ +static int tracing_wait_pipe(struct file *filp) +{ + struct trace_iterator *iter = filp->private_data; + int ret; + + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { + return -EAGAIN; + } + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracing_is_on() && iter->pos) + break; + + mutex_unlock(&iter->mutex); + + ret = wait_on_pipe(iter, false); + + mutex_lock(&iter->mutex); + + if (ret) + return ret; + } + + return 1; +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + + trace_seq_init(&iter->seq); + + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. 
+ */ + mutex_lock(&iter->mutex); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + +waitagain: + sret = tracing_wait_pipe(filp); + if (sret <= 0) + goto out; + + /* stop when tracing is finished */ + if (trace_empty(iter)) { + sret = 0; + goto out; + } + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + /* reset all but tr, trace, and overruns */ + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + cpumask_clear(iter->started); + iter->pos = -1; + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { + enum print_line_t ret; + int save_len = iter->seq.seq.len; + + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + /* don't print partial lines */ + iter->seq.seq.len = save_len; + break; + } + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + + if (trace_seq_used(&iter->seq) >= cnt) + break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. + */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); + } + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + trace_seq_init(&iter->seq); + + /* + * If there was nothing to send to user, in spite of consuming trace + * entries, go back to wait for more entries. + */ + if (sret == -EBUSY) + goto waitagain; + +out: + mutex_unlock(&iter->mutex); + + return sret; +} + +static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, + unsigned int idx) +{ + __free_page(spd->pages[idx]); +} + +static const struct pipe_buf_operations tracing_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static size_t +tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) +{ + size_t count; + int save_len; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + save_len = iter->seq.seq.len; + ret = print_trace_line(iter); + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; + break; + } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. + */ + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; + break; + } + + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + rem -= count; + if (!trace_find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + return rem; +} + +static ssize_t tracing_splice_read_pipe(struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct trace_iterator *iter = filp->private_data; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .nr_pages = 0, /* This gets updated below. 
*/ + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, + }; + ssize_t ret; + size_t rem; + unsigned int i; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + mutex_lock(&iter->mutex); + + if (iter->trace->splice_read) { + ret = iter->trace->splice_read(iter, filp, + ppos, pipe, len, flags); + if (ret) + goto out_err; + } + + ret = tracing_wait_pipe(filp); + if (ret <= 0) + goto out_err; + + if (!iter->ent && !trace_find_next_entry_inc(iter)) { + ret = -EFAULT; + goto out_err; + } + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + + /* Fill as many pages as possible. */ + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) + break; + + rem = tracing_fill_pipe_page(rem, iter); + + /* Copy the data into the page, so we can start over. */ + ret = trace_seq_to_buffer(&iter->seq, + page_address(spd.pages[i]), + trace_seq_used(&iter->seq)); + if (ret < 0) { + __free_page(spd.pages[i]); + break; + } + spd.partial[i].offset = 0; + spd.partial[i].len = trace_seq_used(&iter->seq); + + trace_seq_init(&iter->seq); + } + + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + mutex_unlock(&iter->mutex); + + spd.nr_pages = i; + + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(&spd); + return ret; + +out_err: + mutex_unlock(&iter->mutex); + goto out; +} + +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + char buf[64]; + int r = 0; + ssize_t ret; + + mutex_lock(&trace_types_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) { + int cpu, buf_size_same; + unsigned long size; + + size = 0; + buf_size_same = 1; + /* check if all cpu sizes are same */ + for_each_tracing_cpu(cpu) { + /* fill in the size from first enabled cpu */ + if (size == 0) + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { + buf_size_same = 0; + break; + } + } + + if (buf_size_same) { + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + size >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", size >> 10); + } else + r = sprintf(buf, "X\n"); + } else + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); + + mutex_unlock(&trace_types_lock); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + return ret; +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + /* value is in KB */ + val <<= 10; + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; + if 
(!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; + + return cnt; +} + +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + /* disable tracing ? */ + if (trace_flags & TRACE_ITER_STOP_ON_FREE) + tracer_tracing_off(tr); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + + trace_array_put(tr); + + return 0; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + unsigned long addr = (unsigned long)ubuf; + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + void *map_page[2]; + int nr_pages = 1; + ssize_t written; + int offset; + int size; + int len; + int ret; + int i; + + if (tracing_disabled) + return -EINVAL; + + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. 
+ */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; + } + + for (i = 0; i < nr_pages; i++) + map_page[i] = kmap_atomic(pages[i]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; + } + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, map_page[0] + offset, len); + memcpy(&entry->buf[len], map_page[1], cnt - len); + } else + memcpy(&entry->buf, map_page[0] + offset, cnt); + + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + __buffer_unlock_commit(buffer, event); + + written = cnt; + + *fpos += written; + + out_unlock: + for (i = nr_pages - 1; i >= 0; i--) { + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } + out: + return written; +} + +static int tracing_clock_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + seq_printf(m, + "%s%s%s%s", i ? " " : "", + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); + seq_putc(m, '\n'); + + return 0; +} + +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + tr->clock_id = i; + + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); + + /* + * New clock may not be consistent with the previous clock. + * Reset the buffer so that it doesn't have incomparable timestamps. 
+ */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&tr->max_buffer); +#endif + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + + *fpos += cnt; + + return cnt; +} + +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int read; +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret = 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + goto out; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->trace_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; + } +out: + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif + if (!tr->allocated_snapshot) { + ret = alloc_snapshot(tr); + if (ret < 0) + break; + } + local_irq_disable(); + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + update_max_tr(tr, current, smp_processor_id()); + else + update_max_tr_single(tr, current, iter->cpu_file); + local_irq_enable(); + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset(&tr->max_buffer, 
iter->cpu_file); + } + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.trace_buffer = &info->iter.tr->max_buffer; + + return ret; +} + +#endif /* CONFIG_TRACER_SNAPSHOT */ + + +static const struct file_operations tracing_thresh_fops = { + .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .poll = tracing_poll_pipe, + .read = tracing_read_pipe, + .splice_read = tracing_splice_read_pipe, + .release = tracing_release_pipe, + .llseek = no_llseek, +}; + +static const struct file_operations tracing_entries_fops = { + .open = tracing_open_generic_tr, + .read = tracing_entries_read, + .write = tracing_entries_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic_tr, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + +static const struct file_operations tracing_mark_fops = { + .open = tracing_open_generic_tr, + .write = tracing_mark_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations trace_clock_fops = { + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, + .write = tracing_clock_write, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, +}; + +static const struct file_operations 
snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + struct ftrace_buffer_info *info; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + trace_array_put(tr); + return -ENOMEM; + } + + mutex_lock(&trace_types_lock); + + info->iter.tr = tr; + info->iter.cpu_file = tracing_get_cpu(inode); + info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; + info->spare = NULL; + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + + filp->private_data = info; + + tr->current_trace->ref++; + + mutex_unlock(&trace_types_lock); + + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + ssize_t ret; + ssize_t size; + + if (!count) + return 0; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; +#endif + + if (!info->spare) + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); + if (!info->spare) + return -ENOMEM; + + /* Do we have previous read data to read? 
*/ + if (info->read < PAGE_SIZE) + goto read; + + again: + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->trace_buffer->buffer, + &info->spare, + count, + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); + + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, false); + if (ret) + return ret; + + goto again; + } + return 0; + } + + info->read = 0; + read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret == size) + return -EFAULT; + + size -= ret; + + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + mutex_lock(&trace_types_lock); + + iter->tr->current_trace->ref--; + + __trace_array_put(iter->tr); + + if (info->spare) + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + kfree(info); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +struct buffer_ref { + struct ring_buffer *buffer; + void *page; + int ref; +}; + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + buf->private = 0; +} + +static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + ref->ref++; +} + +/* Pipe buffer operations for a buffer. */ +static const struct pipe_buf_operations buffer_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = buffer_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. 
+ */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int entries, size, i; + ssize_t ret = 0; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; +#endif + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + if (*ppos & (PAGE_SIZE - 1)) + return -EINVAL; + + if (len & (PAGE_SIZE - 1)) { + if (len < PAGE_SIZE) + return -EINVAL; + len &= PAGE_MASK; + } + + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + ret = -ENOMEM; + break; + } + + ref->ref = 1; + ref->buffer = iter->trace_buffer->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); + if (!ref->page) { + ret = -ENOMEM; + kfree(ref); + break; + } + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, iter->cpu_file, 1); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + break; + } + + /* + * zero out any left over data, this is going to + * user land. + */ + size = ring_buffer_page_len(ref->page); + if (size < PAGE_SIZE) + memset(ref->page + size, 0, PAGE_SIZE - size); + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + } + + trace_access_unlock(iter->cpu_file); + spd.nr_pages = i; + + /* did we read anything? 
*/ + if (!spd.nr_pages) { + if (ret) + return ret; + + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, true); + if (ret) + return ret; + + goto again; + } + + ret = splice_to_pipe(pipe, &spd); + splice_shrink_spd(&spd); + + return ret; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .poll = tracing_buffers_poll, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + struct trace_buffer *trace_buf = &tr->trace_buffer; + int cpu = tracing_get_cpu(inode); + struct trace_seq *s; + unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + if (trace_clocks[tr->clock_id].in_ns) { + /* local or global for trace_clock */ + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", + t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + } else { + /* counter or tsc mode for trace_clock */ + trace_seq_printf(s, "oldest event ts: %llu\n", + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + + trace_seq_printf(s, "now ts: %llu\n", + ring_buffer_time_stamp(trace_buf->buffer, cpu)); + } + + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "dropped events: %ld\n", cnt); + + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic_tr, + .read = tracing_stats_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + +static ssize_t +tracing_read_dyn_info(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); + unsigned long *p = filp->private_data; + char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + int r; + + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); + + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; +} + +static const struct file_operations tracing_dyn_info_fops = { + .open = tracing_open_generic, + .read = tracing_read_dyn_info, + .llseek = 
generic_file_llseek, +}; +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + tracing_snapshot(); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_puts(m, "snapshot"); + + if (count == -1) + seq_puts(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + if (ret >= 0) + alloc_snapshot(&global_trace); + + return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static __init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline __init int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +static struct dentry *tracing_get_dentry(struct trace_array *tr) +{ + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + + /* Top directory uses NULL as the parent */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return NULL; + + /* All sub buffers have a descriptor */ + return tr->dir; +} + +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) +{ + struct dentry *d_tracer; + + if (tr->percpu_dir) + return tr->percpu_dir; + + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) + return NULL; + + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); + + WARN_ONCE(!tr->percpu_dir, + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); + + return tr->percpu_dir; +} + +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) +{ + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + d_inode(ret)->i_cdev = (void *)(cpu + 1); + return ret; +} + +static void +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) +{ + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); + struct dentry *d_cpu; + char cpu_dir[30]; /* 30 characters should be more than enough */ + + if (!d_percpu) + return; + + snprintf(cpu_dir, 30, "cpu%ld", cpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); + if (!d_cpu) { + pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + return; + } + + /* per cpu trace_pipe */ + trace_create_cpu_file("trace_pipe", 0444, d_cpu, + tr, cpu, &tracing_pipe_fops); + + /* per cpu trace */ + trace_create_cpu_file("trace", 0644, d_cpu, + tr, cpu, &tracing_fops); + + trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + tr, cpu, &tracing_buffers_fops); + + trace_create_cpu_file("stats", 0444, d_cpu, + tr, cpu, &tracing_stats_fops); + + trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + tr, cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_cpu_file("snapshot", 0644, d_cpu, + tr, cpu, &snapshot_fops); + + trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + tr, cpu, &snapshot_raw_fops); +#endif +} + +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct trace_array *tr; + struct dentry *entry; +}; + +static ssize_t +trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + char *buf; + + if (topt->flags->val & topt->opt->bit) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + if (!!(topt->flags->val & 
topt->opt->bit) != val) { + mutex_lock(&trace_types_lock); + ret = __set_tracer_option(topt->tr, topt->flags, + topt->opt, !val); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + } + + *ppos += cnt; + + return cnt; +} + + +static const struct file_operations trace_options_fops = { + .open = tracing_open_generic, + .read = trace_options_read, + .write = trace_options_write, + .llseek = generic_file_llseek, +}; + +static ssize_t +trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char *buf; + + if (trace_flags & (1 << index)) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + long index = (long)filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + mutex_lock(&trace_types_lock); + ret = set_tracer_flag(tr, 1 << index, val); + mutex_unlock(&trace_types_lock); + + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations trace_options_core_fops = { + .open = tracing_open_generic, + .read = trace_options_core_read, + .write = trace_options_core_write, + .llseek = generic_file_llseek, +}; + +struct dentry *trace_create_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = tracefs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create tracefs '%s' entry\n", name); + + return ret; +} + + +static struct dentry *trace_options_init_dentry(struct trace_array *tr) +{ + struct dentry *d_tracer; + + if (tr->options) + return tr->options; + + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) + return NULL; + + tr->options = tracefs_create_dir("options", d_tracer); + if (!tr->options) { + pr_warning("Could not create tracefs directory 'options'\n"); + return NULL; + } + + return tr->options; +} + +static void +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, + struct tracer_flags *flags, + struct tracer_opt *opt) +{ + struct dentry *t_options; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return; + + topt->flags = flags; + topt->opt = opt; + topt->tr = tr; + + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, + &trace_options_fops); + +} + +static struct trace_option_dentry * +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +{ + struct trace_option_dentry *topts; + struct tracer_flags *flags; + struct tracer_opt *opts; + int cnt; + + if (!tracer) + return NULL; + + flags = tracer->flags; + + if (!flags || !flags->opts) + return NULL; + + opts = flags->opts; + + for (cnt = 0; opts[cnt].name; cnt++) + ; + + topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); + if (!topts) + return NULL; + + for (cnt = 0; opts[cnt].name; cnt++) + create_trace_option_file(tr, &topts[cnt], flags, + &opts[cnt]); + + return topts; +} + +static void +destroy_trace_option_files(struct trace_option_dentry *topts) +{ + int cnt; + + if (!topts) + return; + + for (cnt = 0; topts[cnt].opt; cnt++) + tracefs_remove(topts[cnt].entry); + + kfree(topts); +} + +static struct dentry * +create_trace_option_core_file(struct trace_array 
*tr, + const char *option, long index) +{ + struct dentry *t_options; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return NULL; + + return trace_create_file(option, 0644, t_options, (void *)index, + &trace_options_core_fops); +} + +static __init void create_trace_options_dir(struct trace_array *tr) +{ + struct dentry *t_options; + int i; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return; + + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(tr, trace_options[i], i); +} + +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tracer_tracing_is_on(tr); + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (buffer) { + mutex_lock(&trace_types_lock); + if (val) { + tracer_tracing_on(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); + } else { + tracer_tracing_off(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); + } + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic_tr, + .read = rb_simple_read, + .write = rb_simple_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +struct dentry *trace_instance_dir; + +static void +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ + enum ring_buffer_flags rb_flags; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + buf->tr = tr; + + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; + + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + return -ENOMEM; + } + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->trace_buffer, + ring_buffer_size(tr->trace_buffer.buffer, 0)); + + return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; + + ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + if (ret) + return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (WARN_ON(ret)) { + ring_buffer_free(tr->trace_buffer.buffer); + free_percpu(tr->trace_buffer.data); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; + + /* + * Only the top level trace array gets its snapshot allocated + * from the kernel command line. 
+ */ + allocate_snapshot = false; +#endif + return 0; +} + +static void free_trace_buffer(struct trace_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + free_trace_buffer(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + free_trace_buffer(&tr->max_buffer); +#endif +} + +static int instance_mkdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out_unlock; + + tr->name = kstrdup(name, GFP_KERNEL); + if (!tr->name) + goto out_free_tr; + + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&tr->start_lock); + + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + + if (allocate_trace_buffers(tr, trace_buf_size) < 0) + goto out_free_tr; + + tr->dir = tracefs_create_dir(name, trace_instance_dir); + if (!tr->dir) + goto out_free_tr; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) { + tracefs_remove_recursive(tr->dir); + goto out_free_tr; + } + + init_tracer_tracefs(tr, tr->dir); + + list_add(&tr->list, &ftrace_trace_arrays); + + mutex_unlock(&trace_types_lock); + + return 0; + + out_free_tr: + free_trace_buffers(tr); + free_cpumask_var(tr->tracing_cpumask); + kfree(tr->name); + kfree(tr); + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; + +} + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int found = 0; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + found = 1; + break; + } + } + if (!found) + goto out_unlock; + + ret = -EBUSY; + if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + goto out_unlock; + + list_del(&tr->list); + + tracing_set_nop(tr); + event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); + debugfs_remove_recursive(tr->dir); + free_trace_buffers(tr); + + kfree(tr->name); + kfree(tr); + + ret = 0; + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); + if (WARN_ON(!trace_instance_dir)) + return; +} + +static void +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) +{ + int cpu; + + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + + trace_create_file("tracing_cpumask", 0644, d_tracer, + tr, &tracing_cpumask_fops); + + trace_create_file("trace_options", 0644, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", 0644, d_tracer, + tr, &tracing_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, + tr, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + tr, &tracing_entries_fops); + + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + tr, 
&tracing_total_entries_fops);
+
+ trace_create_file("free_buffer", 0200, d_tracer,
+ tr, &tracing_free_buffer_fops);
+
+ trace_create_file("trace_marker", 0220, d_tracer,
+ tr, &tracing_mark_fops);
+
+ trace_create_file("trace_clock", 0644, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("tracing_on", 0644, d_tracer,
+ tr, &rb_simple_fops);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tr->max_latency, &tracing_max_lat_fops);
+#endif
+
+ if (ftrace_create_function_files(tr, d_tracer))
+ WARN(1, "Could not allocate function filter files");
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ tr, &snapshot_fops);
+#endif
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+}
+
+static struct vfsmount *trace_automount(void *ignore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
+/**
+ * tracing_init_dentry - initialize top level trace array
+ *
+ * This is called when creating files or directories in the tracing
+ * directory. It is called via fs_initcall() by any of the boot up code
+ * and expects to return the dentry of the top level tracing directory.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+ struct trace_array *tr = &global_trace;
+
+ /* The top level trace array uses NULL as parent */
+ if (tr->dir)
+ return NULL;
+
+ if (WARN_ON(!debugfs_initialized()))
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kernel.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
+ if (!tr->dir) {
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return NULL;
+}
+
+extern struct trace_enum_map *__start_ftrace_enum_maps[];
+extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+
+static void __init trace_enum_init(void)
+{
+ int len;
+
+ len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
+ trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+}
+
+#ifdef CONFIG_MODULES
+static void trace_module_add_enums(struct module *mod)
+{
+ if (!mod->num_trace_enums)
+ return;
+
+ /*
+ * Modules with bad taint do not have events created, do
+ * not bother with enums either.
+ */ + if (trace_module_has_bad_taint(mod)) + return; + + trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); +} + +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +static void trace_module_remove_enums(struct module *mod) +{ + union trace_enum_map_item *map; + union trace_enum_map_item **last = &trace_enum_maps; + + if (!mod->num_trace_enums) + return; + + mutex_lock(&trace_enum_mutex); + + map = trace_enum_maps; + + while (map) { + if (map->head.mod == mod) + break; + map = trace_enum_jmp_to_tail(map); + last = &map->tail.next; + map = map->tail.next; + } + if (!map) + goto out; + + *last = trace_enum_jmp_to_tail(map)->tail.next; + kfree(map); + out: + mutex_unlock(&trace_enum_mutex); +} +#else +static inline void trace_module_remove_enums(struct module *mod) { } +#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_enums(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_enums(mod); + break; + } + + return 0; +} + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; +#endif /* CONFIG_MODULES */ + +static __init int tracer_init_tracefs(void) +{ + struct dentry *d_tracer; + + trace_access_lock_init(); + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + init_tracer_tracefs(&global_trace, d_tracer); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &global_trace, &tracing_thresh_fops); + + trace_create_file("README", 0444, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); + + trace_enum_init(); + + trace_create_enum_file(d_tracer); + +#ifdef CONFIG_MODULES + register_module_notifier(&trace_module_nb); +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); +#endif + + create_trace_instances(d_tracer); + + create_trace_options_dir(&global_trace); + + /* If the tracer was started via cmdline, create options for it here */ + if (global_trace.current_trace != &nop_trace) + update_tracer_options(&global_trace, global_trace.current_trace); + + return 0; +} + +static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + if (ftrace_dump_on_oops) + ftrace_dump(ftrace_dump_on_oops); + return NOTIFY_OK; +} + +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; + +static int trace_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + switch (val) { + case DIE_OOPS: + if (ftrace_dump_on_oops) + ftrace_dump(ftrace_dump_on_oops); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_handler, + .priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. 
+ */ +#define KERN_TRACE KERN_EMERG + +void +trace_printk_seq(struct trace_seq *s) +{ + /* Probably should print a warning here. */ + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; + + /* should be zero ended, but we are paranoid. */ + s->buffer[s->seq.len] = 0; + + printk(KERN_TRACE "%s", s->buffer); + + trace_seq_init(s); +} + +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = iter->tr->current_trace; + iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + static atomic_t dump_running; + unsigned int old_userobj; + unsigned long flags; + int cnt = 0, cpu; + + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ + tracing_off(); + + local_irq_save(flags); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + } + + old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = RING_BUFFER_ALL_CPUS; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = RING_BUFFER_ALL_CPUS; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. 
+ */ + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (trace_find_next_entry_inc(&iter) != NULL) { + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); + } + touch_nmi_watchdog(); + + trace_printk_seq(&iter.seq); + } + + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + out_enable: + trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ftrace_dump); + +__init static int tracer_alloc_buffers(void) +{ + int ring_buf_size; + int ret = -ENOMEM; + + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + + /* Only allocate trace_printk buffers if a trace_printk exists */ + if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + /* Must be called before global_trace.buffer is allocated */ + trace_printk_init_buffers(); + + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&global_trace.start_lock); + + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + + /* TODO: make the number of buffers hot pluggable with CPUS */ + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); + WARN_ON(1); + goto out_free_savedcmd; + } + + if (global_trace.buffer_disabled) + tracing_off(); + + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } + + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. 
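trace_panic_handler() and trace_die_handler() above only call ftrace_dump() when ftrace_dump_on_oops is non-zero, which can be set on the kernel command line or at run time through the sysctl. A minimal run-time sketch (the proc path is the standard sysctl location; 1 selects DUMP_ALL and 2 DUMP_ORIG, matching the switch in ftrace_dump() above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/ftrace_dump_on_oops", "w");

	if (!f) {
		perror("ftrace_dump_on_oops");
		return 1;
	}
	/* Dump all CPU buffers to the console on oops or panic. */
	fputs("1\n", f);
	fclose(f);
	return 0;
}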
+ */ + global_trace.current_trace = &nop_trace; + + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + ftrace_init_global_array_ops(&global_trace); + + register_tracer(&nop_trace); + + /* All seems OK, enable tracing */ + tracing_disabled = 0; + + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); + + register_die_notifier(&trace_die_notifier); + + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + + while (trace_boot_options) { + char *option; + + option = strsep(&trace_boot_options, ","); + trace_set_options(&global_trace, option); + } + + register_snapshot_cmd(); + + return 0; + +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); +out_free_temp_buffer: + ring_buffer_free(temp_buffer); +out_free_cpumask: + free_cpumask_var(global_trace.tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; +} + +void __init trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (WARN_ON(!tracepoint_print_iter)) + tracepoint_printk = 0; + } + tracer_alloc_buffers(); + trace_event_init(); +} + +__init static int clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. + */ + if (!default_bootup_tracer) + return 0; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; + + return 0; +} + +fs_initcall(tracer_init_tracefs); +late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 000000000..921691c5c --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,1321 @@ + +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include <linux/fs.h> +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/clocksource.h> +#include <linux/ring_buffer.h> +#include <linux/mmiotrace.h> +#include <linux/tracepoint.h> +#include <linux/ftrace.h> +#include <linux/hw_breakpoint.h> +#include <linux/trace_seq.h> +#include <linux/ftrace_event.h> +#include <linux/compiler.h> +#include <linux/trace_seq.h> + +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/syscall.h> /* some archs define it here */ +#endif + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_PRINT, + TRACE_BPRINT, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, + TRACE_BRANCH, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_BLK, + TRACE_BPUTS, + + __TRACE_LAST_TYPE, +}; + + +#undef __field +#define __field(type, item) type item; + +#undef __field_struct +#define __field_struct(type, item) __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) 
args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } + +#undef TP_ARGS +#define TP_ARGS(args...) args + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + +#include "trace_entries.h" + +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + long ret; +}; + +struct kprobe_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct kretprobe_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * NEED_RESCHED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_IRQS_NOSUPPORT = 0x02, + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, +}; + +#define TRACE_BUF_SIZE 1024 + +struct trace_array; + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + atomic_t disabled; + void *buffer_page; /* ring buffer spare */ + + unsigned long entries; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long skipped_entries; + cycle_t preempt_timestamp; + pid_t pid; + kuid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct tracer; + +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. + * They have on/off state as well: + */ +struct trace_array { + struct list_head list; + char *name; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. + */ + struct trace_buffer max_buffer; + bool allocated_snapshot; + unsigned long max_latency; +#endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. 
But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ + arch_spinlock_t max_lock; + int buffer_disabled; +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; +#endif + int stop_count; + int clock_id; + struct tracer *current_trace; + unsigned int flags; + raw_spinlock_t start_lock; + struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; + cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif +}; + +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + if (list_empty(&ftrace_trace_arrays)) + return NULL; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + __ftrace_bad_type(); \ + } while (0) + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. 
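The max_buffer documented in struct trace_array above is what backs the per-instance "snapshot" file created by init_tracer_tracefs(): writing 1 swaps the live buffer with max_buffer, and reading the file shows the frozen copy. A short user-space sketch (path assumed; requires CONFIG_TRACER_SNAPSHOT):

#include <stdio.h>

#define SNAPSHOT "/sys/kernel/tracing/snapshot"   /* assumed tracefs path */

int main(void)
{
	char line[256];
	FILE *f = fopen(SNAPSHOT, "r+");

	if (!f) {
		perror(SNAPSHOT);
		return 1;
	}

	/* "1" allocates max_buffer if needed and swaps it with trace_buffer. */
	fputs("1\n", f);
	fflush(f);
	rewind(f);

	/* The snapshot now holds the events captured up to the swap. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}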
+ */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. + */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + + +/** + * struct tracer - a specific tracer and its callbacks to interact with tracefs + * @name: the name chosen to select it on the available_tracers file + * @init: called when one switches to this tracer (echo name > current_tracer) + * @reset: called when one switches to another tracer + * @start: called when tracing is unpaused (echo 1 > tracing_enabled) + * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @update_thresh: called when tracing_thresh is updated + * @open: called when the trace file is opened + * @pipe_open: called when the trace_pipe file is opened + * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released + * @read: override the default read callback on trace_pipe + * @splice_read: override the default splice_read callback on trace_pipe + * @selftest: selftest to run on boot (see trace_selftest.c) + * @print_headers: override the first lines that describe your columns + * @print_line: callback that prints a trace + * @set_flag: signals one of your private flags changed (trace_options file) + * @flags: your private flags + */ +struct tracer { + const char *name; + int (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); + int (*update_thresh)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + ssize_t (*splice_read)(struct trace_iterator *iter, + struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + void (*print_header)(struct seq_file *m); + enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct trace_array *tr, + u32 mask, int set); + struct tracer *next; + struct tracer_flags *flags; + int enabled; + int ref; + bool print_max; + bool allow_instances; +#ifdef CONFIG_TRACER_MAX_TRACE + bool use_max_tr; +#endif +}; + + +/* Only current can touch trace_recursion */ + +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. 
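struct tracer above is the whole plug-in interface: a tracer fills in a name plus the callbacks it cares about and hands the structure to register_tracer(), after which it appears in available_tracers. A minimal kernel-side sketch (the "noop2" name and functions are hypothetical; trace_branch.c further down follows the same shape):

/* In a separate .c file that #includes "trace.h": */

static int noop2_init(struct trace_array *tr)
{
	/* Called when this tracer becomes current_tracer. */
	return 0;
}

static void noop2_reset(struct trace_array *tr)
{
	/* Called when switching away to another tracer. */
}

static struct tracer noop2_tracer __read_mostly = {
	.name	= "noop2",
	.init	= noop2_init,
	.reset	= noop2_reset,
};

static __init int init_noop2_tracer(void)
{
	return register_tracer(&noop2_tracer);
}
core_initcall(init_noop2_tracer);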
+ * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, + + TRACE_BRANCH_BIT, +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ + TRACE_IRQ_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + +int tracer_init(struct tracer *t, struct trace_array *tr); +int tracing_is_enabled(void); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); +void tracing_reset_current(int cpu); +void tracing_reset_all_online_cpus(void); +int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); +struct dentry *trace_create_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + const struct file_operations 
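trace_test_and_set_recursion()/trace_clear_recursion() above are the guards a function-tracing callback wraps around its body so that an event generated while tracing (from irq, softirq, or NMI context, or the tracer itself) cannot recurse. A sketch of the usual calling pattern (the callback name is hypothetical; the TRACE_FTRACE_START/TRACE_FTRACE_MAX pair is the one used by the function tracer's own callbacks):

static void my_function_trace_call(unsigned long ip, unsigned long parent_ip,
				   struct ftrace_ops *op, struct pt_regs *regs)
{
	int bit;

	/* Returns -1 if this context is already inside the callback,
	 * otherwise a bit number that must be released afterwards. */
	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
	if (bit < 0)
		return;

	/* ... safe to record an event for @ip here ... */

	trace_clear_recursion(bit);
}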
*fops); + +struct dentry *tracing_init_dentry(void); + +struct ring_buffer_event; + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, + int pc); + +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + +void __buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + +void trace_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); + +void trace_graph_return(struct ftrace_graph_ret *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); + +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +int register_tracer(struct tracer *type); +int is_tracing_stopped(void); + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); + +extern cpumask_var_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ + +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc); + +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs); + +void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, + int pc); + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, + unsigned long flags, int skip, + int pc, struct pt_regs *regs) +{ +} + +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ + +extern cycle_t ftrace_now(int cpu); + +extern void trace_find_cmdline(int pid, char comm[]); + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#endif +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); + +extern bool ring_buffer_expanded; +extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(int, 
ftrace_cpu_disabled); + +#ifdef CONFIG_FTRACE_STARTUP_TEST +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. */ +#define __tracer_data __read_mostly +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern unsigned long long ns2usecs(cycle_t nsec); +extern int +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); + +extern unsigned long trace_flags; + +extern char trace_find_mark(unsigned long long duration); + +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); +extern void +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, int pc); + + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long 
ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_count) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + return 1; + } + } + + return 0; +} + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_notrace_count) + return 0; + + for (i = 0; i < ftrace_graph_notrace_count; i++) { + if (addr == ftrace_graph_notrace_funcs[i]) + return 1; + } + + return 0; +} +#else +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; +} + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static inline enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +extern struct list_head ftrace_pids; + +#ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata; +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (list_empty(&ftrace_pids)) + return 1; + + return test_tsk_trace_trace(task); +} +extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +static inline int ftrace_is_dead(void) { return 0; } +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. 
+ */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ + +int ftrace_event_is_function(struct ftrace_event_call *call); + +/* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input length + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, + TRACE_ITER_STOP_ON_FREE = 0x400000, + TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, + TRACE_ITER_FUNCTION = 0x2000000, +}; + +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. 
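The trace_parser declared earlier in this header is the helper that write handlers such as set_ftrace_filter and set_event use to turn a user buffer into one whitespace-separated token at a time. A sketch of the consumer pattern (the handler name and buffer size are hypothetical):

static ssize_t my_token_write(struct file *filp, const char __user *ubuf,
			      size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read;

	if (trace_parser_get_init(&parser, 128))
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);
	if (read >= 0 && trace_parser_loaded(&parser) &&
	    !trace_parser_cont(&parser)) {
		/* parser.buffer now holds one NUL-terminated token of
		 * length parser.idx; act on it, then reset the parser. */
		trace_parser_clear(&parser);
	}

	trace_parser_put(&parser);
	return read;
}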
+ */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +extern struct tracer nop_trace; + +#ifdef CONFIG_BRANCH_TRACER +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); + return 0; +} +static inline void trace_branch_disable(void) +{ + /* due to races, always disable */ + disable_branch_tracing(); +} +#else +static inline int trace_branch_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_branch_disable(void) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + +struct ftrace_event_field { + struct list_head link; + const char *name; + const char *type; + int filter_type; + int offset; + int size; + int is_signed; +}; + +struct event_filter { + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; + char *filter_string; +}; + +struct event_subsystem { + struct list_head list; + const char *name; + struct event_filter *filter; + int ref_count; +}; + +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + int nr_events; +}; + +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
+ */ +#define MAX_FILTER_PRED 16384 + +struct filter_pred; +struct regex; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + struct regex regex; + unsigned short *ops; + struct ftrace_event_field *field; + int offset; + int not; + int op; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; +}; + +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); +extern void print_event_filter(struct ftrace_event_file *file, + struct trace_seq *s); +extern int apply_event_filter(struct ftrace_event_file *file, + char *filter_string); +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s); +extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); + +extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); + +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); + +static inline void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + +extern struct mutex event_mutex; +extern struct list_head ftrace_events; + +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { + unsigned long count; + int ref; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + struct list_head list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). 
This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*func)(struct event_trigger_data *data); + int (*init)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + void (*free)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. 
In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + bool post_trigger; + int (*func)(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *params); + int (*reg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct ftrace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); + +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + +void trace_printk_init_buffers(void); +void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. 
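The event_command/event_trigger_ops pair documented above sits behind each per-event "trigger" file: the command name, an optional ":count", and an optional "if <filter>" clause are parsed by the command's @func() callback. A user-space sketch that arms a trigger (the event path and trigger string are illustrative):

#include <stdio.h>

#define TRIGGER "/sys/kernel/tracing/events/sched/sched_switch/trigger"

int main(void)
{
	FILE *f = fopen(TRIGGER, "w");

	if (!f) {
		perror(TRIGGER);
		return 1;
	}
	/* "traceoff:1" runs the traceoff command once when the event fires;
	 * writing the same string prefixed with '!' removes the trigger. */
	fputs("traceoff:1\n", f);
	fclose(f);
	return 0;
}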
But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ + extern struct ftrace_event_call \ + __aligned(4) event_##call; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) +#include "trace_entries.h" + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif + +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +#else +static inline void init_ftrace_syscalls(void) { } +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +void trace_event_enum_update(struct trace_enum_map **map, int len); +#else +static inline void __init trace_event_init(void) { } +static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +#endif + +extern struct trace_iterator *tracepoint_print_iter; + +#endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000..40a14cbcf --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + u64 stddev; + u64 seed; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. + */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. 
Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actualy number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* These don't need to be reset but reset them anyway */ + bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000..3c1df1df4 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
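The seed loop in trace_do_benchmark() above is a Newton/Babylonian integer square root of the variance, iterated at most ten times from the running average as the initial guess. The same computation as a stand-alone sketch (user space, plain 64-bit division instead of do_div()):

#include <stdint.h>
#include <stdio.h>

/* Babylonian iteration x = (x + N/x) / 2, seeded with a rough guess. */
static uint64_t isqrt(uint64_t n, uint64_t seed)
{
	uint64_t last;
	int i = 0;

	if (!seed)
		seed = 1;
	do {
		last = seed;
		seed = (last + n / last) / 2;
	} while (i++ < 10 && seed != last);

	return seed;
}

int main(void)
{
	/* Example: standard deviation from a variance of 10000 ns^2. */
	printf("sqrt(10000) ~= %llu\n",
	       (unsigned long long)isqrt(10000, 100));
	return 0;
}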
+#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 000000000..1879980f0 --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,414 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/hash.h> +#include <linux/fs.h> +#include <asm/local.h> + +#include "trace.h" +#include "trace_stat.h" +#include "trace_output.h" + +#ifdef CONFIG_BRANCH_TRACER + +static struct tracer branch_trace; +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); + +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct ftrace_event_call *call = &event_branch; + struct trace_array *tr = branch_tracer; + struct trace_array_cpu *data; + struct ring_buffer_event *event; + struct trace_branch *entry; + struct ring_buffer *buffer; + unsigned long flags; + int pc; + const char *p; + + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->trace_buffer.data); + if (atomic_read(&data->disabled)) + goto out; + + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, + sizeof(*entry), flags, pc); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); + + out: + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. 
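+ * (The smp_wmb() below orders the branch_tracer assignment against the
+ * branch_tracing_enabled increment, so a reader that sees the counter
+ * as non-zero also sees a valid trace_array pointer.)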
The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return 0; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static int branch_trace_init(struct trace_array *tr) +{ + start_branch_trace(tr); + return 0; +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct trace_branch *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + + return trace_handle_return(&iter->seq); +} + +static void branch_print_header(struct seq_file *s) +{ + seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" + " FUNC:FILE:LINE\n" + "# | | | | | " + " |\n"); +} + +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .funcs = &trace_branch_funcs, +}; + +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ + .print_header = branch_print_header, +}; + +__init static int init_branch_tracer(void) +{ + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } + return register_tracer(&branch_trace); +} +core_initcall(init_branch_tracer); + +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; + +static int annotated_branch_stat_headers(struct seq_file *m) +{ + seq_puts(m, " correct incorrect % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static inline long get_incorrect_percent(struct ftrace_branch_data *p) +{ + long percent; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : -1; + + return percent; +} + +static int branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + long percent; + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + /* + * The miss is overlayed on correct, and hit on incorrect. + */ + percent = get_incorrect_percent(p); + + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_puts(m, " X "); + else + seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static void *annotated_branch_stat_start(struct tracer_stat *trace) +{ + return __start_annotated_branch_profile; +} + +static void * +annotated_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; +} + +static int annotated_branch_stat_cmp(void *p1, void *p2) +{ + struct ftrace_branch_data *a = p1; + struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; +} + +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int init_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(init_annotated_branch_stats); + +#ifdef CONFIG_PROFILE_ALL_BRANCHES + +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_puts(m, " miss hit % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static void *all_branch_stat_start(struct tracer_stat *trace) +{ + return __start_branch_profile; +} + +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; + + return p; +} + +static struct tracer_stat all_branch_stats = { + .name = "branch_all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int all_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&all_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 000000000..57b67b1f2 --- /dev/null +++ 
b/kernel/trace/trace_clock.c @@ -0,0 +1,137 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/hardirq.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/ktime.h> +#include <linux/trace_clock.h> + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + u64 clock; + + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + preempt_disable_notrace(); + clock = sched_clock(); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(trace_clock_local); + +/* + * trace_clock(): 'between' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return local_clock(); +} + +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. + */ +u64 notrace trace_clock_jiffies(void) +{ + return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); +} + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + arch_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, + }; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = sched_clock_cpu(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + arch_spin_lock(&trace_clock_struct.lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; + + trace_clock_struct.prev_time = now; + + arch_spin_unlock(&trace_clock_struct.lock); + + out: + local_irq_restore(flags); + + return now; +} + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. 
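+ * (It can be selected from user space like the other trace clocks,
+ * e.g. "echo counter > trace_clock" in the tracing directory.)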
+ */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 000000000..ee7b94a48 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,324 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is laid out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function address: + */ +FTRACE_ENTRY_REG(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + FILTER_TRACE_FN, + + perf_ftrace_event_register +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth), + + FILTER_OTHER +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. 
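+ * (TRACE_CTX and TRACE_WAKE both reuse the ctx_switch_entry layout
+ * defined below.)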
We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu), + + FILTER_OTHER +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu), + + FILTER_OTHER +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __field( int, size ) + __dynamic_array(unsigned long, caller ) + ), + + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->fmt), + + FILTER_OTHER +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->str), + + FILTER_OTHER +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, 
__entry->width), + + FILTER_OTHER +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode), + + FILTER_OTHER +); + + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER +); + diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 000000000..6fa484de2 --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,384 @@ +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> + */ + +#include <linux/module.h> +#include <linux/kprobes.h> +#include "trace.h" + +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; + +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + + /* + * We checked and allowed to create parent, + * allow children without checking. + */ + if (p_event->parent) + return 0; + + /* + * It's ok to check current process (owner) permissions in here, + * because code below is called only via perf_event_open syscall. + */ + + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } + + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. 
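+ * ("root" here means CAP_SYS_ADMIN; the -EPERM below only applies
+ * while perf_paranoid_tracepoint_raw() reports a restrictive
+ * perf_event_paranoid setting.)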
+ */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + struct hlist_head __percpu *list; + int ret = -ENOMEM; + int cpu; + + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) + return 0; + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; + + if (!total_ref_count) { + char __percpu *buf; + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } + } + + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: + if (!total_ref_count) { + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } + + return ret; +} + +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + +int perf_trace_init(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event; + u64 event_id = p_event->attr.config; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && tp_event->class->reg && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + +int perf_trace_add(struct perf_event *p_event, int flags) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + 
if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); +} + +void perf_trace_del(struct perf_event *p_event, int flags) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); +} + +void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs **regs, int *rctxp) +{ + struct trace_entry *entry; + unsigned long flags; + char *raw_data; + int pc; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return NULL; + + pc = preempt_count(); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + return NULL; + + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + + /* zero the dead bytes from align to not leak stack to user */ + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + + entry = (struct trace_entry *)raw_data; + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); + entry->type = type; + + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *pt_regs) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + + head = this_cpu_ptr(event_function.perf_events); + if (hlist_empty(head)) + return; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head, NULL); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* 
CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 000000000..c4de47fc5 --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,2943 @@ +/* + * event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * - Added format output of fields of the trace point. + * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. + * + */ + +#define pr_fmt(fmt) fmt + +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/slab.h> +#include <linux/delay.h> + +#include <asm/setup.h> + +#include "trace_output.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM "TRACE_SYSTEM" + +DEFINE_MUTEX(event_mutex); + +LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_common_fields); + +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +#define SYSTEM_FL_FREE_NAME (1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + +static struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct ftrace_event_field *field; + + field = kmem_cache_alloc(field_cachep, GFP_TRACE); + if (!field) + return -ENOMEM; + + field->name = name; + field->type = type; + + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + + list_add(&field->link, head); + + return 0; +} + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 
0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} +EXPORT_SYMBOL_GPL(trace_define_field); + +#define __common_field(type, item) \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +static int trace_define_common_fields(void) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + + return ret; +} + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + struct list_head *head; + + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kmem_cache_free(field_cachep, field); + } +} + +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(&call->event); + if (!id) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) +{ + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +static DEFINE_SPINLOCK(tracepoint_iter_lock); + +static void output_printk(struct ftrace_event_buffer *fbuffer) +{ + struct ftrace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + if (!iter) + return; + + event_call = fbuffer->ftrace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->ftrace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + if (tracepoint_printk) + output_printk(fbuffer); + + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + struct ftrace_event_file *file = data; + + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->tp, + call->class->probe, + file); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + file); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->tp, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + 
tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + call); + return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + mutex_lock(&event_mutex); + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + } else { + tracing_stop_cmdline_record(); + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + } + } while_for_each_event_file(); + mutex_unlock(&event_mutex); +} + +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) +{ + struct ftrace_event_call *call = file->event_call; + int ret = 0; + int disable; + + switch (enable) { + case 0: + /* + * When soft_disable is set and enable is cleared, the sm_ref + * reference counter is decremented. If it reaches 0, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + if (atomic_dec_return(&file->sm_ref) > 0) + break; + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + } + call->class->reg(call, TRACE_REG_UNREGISTER, file); + } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + break; + case 1: + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else { + if (atomic_inc_return(&file->sm_ref) > 1) + break; + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. 
*/ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); + } + ret = call->class->reg(call, TRACE_REG_REGISTER, file); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", ftrace_event_name(call)); + break; + } + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + + /* WAS_ENABLED gets set but never cleared. */ + call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + } + break; + } + + return ret; +} + +int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) +{ + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) +{ + return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + mutex_lock(&event_mutex); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); + } + mutex_unlock(&event_mutex); +} + +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) + return; + + list_del(&system->list); + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); +} + +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) +{ + mutex_lock(&event_mutex); + __put_system_dir(dir); + mutex_unlock(&event_mutex); +} + +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + tracefs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_file_dir(struct ftrace_event_file *file) +{ + struct dentry *dir = file->dir; + struct dentry *child; + + if (dir) { + spin_lock(&dir->d_lock); /* probably unneeded */ + list_for_each_entry(child, &dir->d_subdirs, d_child) { + if (d_really_is_positive(child)) /* probably unneeded */ + d_inode(child)->i_private = NULL; + } + spin_unlock(&dir->d_lock); + + tracefs_remove_recursive(dir); + } + + list_del(&file->list); + remove_subsystem(file->system); + free_event_filter(file->filter); + kmem_cache_free(file_cachep, file); +} + +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
+ */ +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + const char *name; + int ret = -EINVAL; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = ftrace_event_name(call); + + if (!name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (match && + strcmp(match, name) != 0 && + strcmp(match, call->class->system) != 0) + continue; + + if (sub && strcmp(sub, call->class->system) != 0) + continue; + + if (event && strcmp(event, name) != 0) + continue; + + ftrace_event_enable_disable(file, set); + + ret = 0; + } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + mutex_unlock(&event_mutex); + + return ret; +} + +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +{ + char *event = NULL, *sub = NULL, *match; + int ret; + + /* + * The buf format can be <subsystem>:<event-name> + * *:<event-name> means any event by that name. + * :<event-name> is the same. + * + * <subsystem>:* means all events in that subsystem + * <subsystem>: means the same. + * + * <name> (no ':') means all events in a subsystem with + * the name <name> or any event that matches <name> + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } + + ret = __ftrace_set_clr_event(tr, match, sub, event, set); + + /* Put back the colon to allow this to be called again */ + if (buf) + *(buf - 1) = ':'; + + return ret; +} + +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. 
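+ *
+ * For example, a caller could enable every event in a subsystem with
+ * something like trace_set_clr_event("sched", NULL, 1); the subsystem
+ * name is only an illustration here.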
+ */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + struct trace_array *tr = top_trace_array(); + + if (!tr) + return -ENODEV; + + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_set_clr_event); + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; + ssize_t read, ret; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + int set = 1; + + if (*parser.buffer == '!') + set = 0; + + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); + if (ret) + goto out_put; + } + + ret = read; + + out_put: + trace_parser_put(&parser); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; + + (*pos)++; + + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; + /* + * The ftrace subsystem is for showing formats only. + * They can not be enabled or disabled via the event files. + */ + if (call->class && call->class->reg) + return file; + } + + return NULL; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *file; + struct trace_array *tr = m->private; + loff_t l; + + mutex_lock(&event_mutex); + + file = list_entry(&tr->events, struct ftrace_event_file, list); + for (l = 0; l <= *pos; ) { + file = t_next(m, file, &l); + if (!file) + break; + } + return file; +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; + + (*pos)++; + + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; + } + + return NULL; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *file; + struct trace_array *tr = m->private; + loff_t l; + + mutex_lock(&event_mutex); + + file = list_entry(&tr->events, struct ftrace_event_file, list); + for (l = 0; l <= *pos; ) { + file = s_next(m, file, &l); + if (!file) + break; + } + return file; +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; + + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); + seq_printf(m, "%s\n", ftrace_event_name(call)); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&event_mutex); +} + +static ssize_t +event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_file *file; + unsigned long flags; + char buf[4] = "0"; + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + flags = file->flags; + mutex_unlock(&event_mutex); + + if (!file) + return -ENODEV; + + if (flags & FTRACE_EVENT_FL_ENABLED && + !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || + flags & FTRACE_EVENT_FL_SOFT_MODE) + 
strcat(buf, "*"); + + strcat(buf, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); +} + +static ssize_t +event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_file *file; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + ret = -ENODEV; + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + ret = ftrace_event_enable_disable(file, val); + mutex_unlock(&event_mutex); + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return ret ? ret : cnt; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; + char buf[2]; + int set = 0; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; + if (!ftrace_event_name(call) || !call->class || !call->class->reg) + continue; + + if (system && strcmp(call->class->system, system->name) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); + + /* + * If we have a mixture, no need to look further. + */ + if (set == 3) + break; + } + mutex_unlock(&event_mutex); + + buf[0] = set_to_char[set]; + buf[1] = '\n'; + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + const char *name = NULL; + unsigned long val; + ssize_t ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. 
+ */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + if (ret) + goto out; + + ret = cnt; + +out: + *ppos += cnt; + + return ret; +} + +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = event_file_data(m->private); + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); + struct list_head *node = v; + + (*pos)++; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + node = common_head; + break; + + case FORMAT_FIELD_SEPERATOR: + node = head; + break; + + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } + + node = node->prev; + if (node == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (node == head) + return (void *)FORMAT_PRINTFMT; + else + return node; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = event_file_data(m->private); + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", ftrace_event_name(call)); + seq_printf(m, "ID: %d\n", call->event.type); + seq_puts(m, "format:\n"); + return 0; + + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } + + field = list_entry(v, struct ftrace_event_field, link); + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + + return 0; +} + +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + if (!event_file_data(m->private)) + return ERR_PTR(-ENODEV); + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + +static void f_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&event_mutex); +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = file; + + return 0; +} + +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int id = (long)event_file_data(filp); + char buf[32]; + int len; + + if (*ppos) + return 0; + + if (unlikely(!id)) + return -ENODEV; + + len = sprintf(buf, "%d\n", id); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_file *file; + struct trace_seq *s; + int r 
= -ENODEV; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) + print_event_filter(file, s); + mutex_unlock(&event_mutex); + + if (file) + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_file *file; + char *buf; + int err = -ENODEV; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long) buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; + int ret; + + if (tracing_is_disabled()) + return -ENODEV; + + /* Make sure the system still exists */ + mutex_lock(&trace_types_lock); + mutex_lock(&event_mutex); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; + } + } + } + exit_loop: + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + + if (!system) + return -ENODEV; + + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); + return -ENODEV; + } + + ret = tracing_open_generic(inode, filp); + if (ret < 0) { + trace_array_put(tr); + put_system(dir); + } + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_is_disabled()) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) { + trace_array_put(tr); + return -ENOMEM; + } + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) { + trace_array_put(tr); + kfree(dir); + return ret; + } + + filp->private_data = dir; + + return 0; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct ftrace_subsystem_dir *dir = file->private_data; + + trace_array_put(dir->tr); + + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. 
+ */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); + + return 0; +} + +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + print_subsystem_event_filter(system, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_subsystem_dir *dir = filp->private_data; + char *buf; + int err; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + err = apply_subsystem_event_filter(dir, buf); + free_page((unsigned long) buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); + +static const struct seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_set_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + +static const struct file_operations ftrace_enable_fops = { + .open = tracing_open_generic, + .read = event_enable_read, + .write = event_enable_write, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_format_fops = { + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_event_id_fops = { + .read = event_id_read, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = subsystem_open, + .read = subsystem_filter_read, + .write = subsystem_filter_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_system_enable_fops = { + .open = subsystem_open, + .read = system_enable_read, + .write = 
system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, + .llseek = default_llseek, +}; + +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; + + return ret; +} + +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_event_seq_ops; + + return ftrace_event_open(inode, file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + + /* Only allocate if dynamic (kprobes and modules) */ + if (!core_kernel_data((unsigned long)name)) { + system->ref_count |= SYSTEM_FL_FREE_NAME; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) + goto out_free; + } else + system->name = name; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); + kfree(system); + return NULL; +} + +static struct dentry * +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct dentry *parent) +{ + struct ftrace_subsystem_dir *dir; + struct event_subsystem *system; + struct dentry *entry; + + /* First see if we did not already create this dir */ + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; + if (strcmp(system->name, name) == 0) { + dir->nr_events++; + file->system = dir; + return dir->entry; + } + } + + /* Now see if the system itself exists. 
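+ * Subsystems live on the global event_subsystems list and are shared
+ * between trace_array instances, so one may already exist even though
+ * this instance has no directory for it yet.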
*/ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; + } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; + + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; + + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = tracefs_create_dir(name, parent); + if (!dir->entry) { + pr_warn("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; + } + + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; + + entry = tracefs_create_file("filter", 0644, dir->entry, dir, + &ftrace_subsystem_filter_fops); + if (!entry) { + kfree(system->filter); + system->filter = NULL; + pr_warn("Could not create tracefs '%s/filter' entry\n", name); + } + + trace_create_file("enable", 0644, dir->entry, dir, + &ftrace_system_enable_fops); + + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warn("No memory to create event subsystem %s\n", name); + return NULL; +} + +static int +event_create_dir(struct dentry *parent, struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; + struct list_head *head; + struct dentry *d_events; + const char *name; + int ret; + + /* + * If the trace point header did not define TRACE_SYSTEM + * then the system would be called "TRACE_SYSTEM". + */ + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + name = ftrace_event_name(call); + file->dir = tracefs_create_dir(name, d_events); + if (!file->dir) { + pr_warn("Could not create tracefs '%s' directory\n", name); + return -1; + } + + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("enable", 0644, file->dir, file, + &ftrace_enable_fops); + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg) + trace_create_file("id", 0444, file->dir, + (void *)(long)call->event.type, + &ftrace_event_id_fops); +#endif + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warn("Could not initialize trace point events/%s\n", + name); + return -1; + } + } + trace_create_file("filter", 0644, file->dir, file, + &ftrace_event_filter_fops); + + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + + trace_create_file("format", 0444, file->dir, call, + &ftrace_event_format_fops); + + return 0; +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + if (file->event_call != call) + continue; + + remove_event_file_dir(file); + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
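+ * A trace_array has at most one ftrace_event_file per event call,
+ * so the first match is the only one.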
+ */ + break; + } while_for_each_event_file(); +} + +static void event_remove(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); + + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + const char *name; + + name = ftrace_event_name(call); + if (WARN_ON(!name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", name); + } + + return ret; +} + +static int +__register_event(struct ftrace_event_call *call, struct module *mod) +{ + int ret; + + ret = event_init(call); + if (ret < 0) + return ret; + + list_add(&call->list, &ftrace_events); + call->mod = mod; + + return 0; +} + +static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +{ + int rlen; + int elen; + + /* Find the length of the enum value as a string */ + elen = snprintf(ptr, 0, "%ld", map->enum_value); + /* Make sure there's enough room to replace the string with the value */ + if (len < elen) + return NULL; + + snprintf(ptr, elen + 1, "%ld", map->enum_value); + + /* Get the rest of the string of ptr */ + rlen = strlen(ptr + len); + memmove(ptr + elen, ptr + len, rlen); + /* Make sure we end the new string */ + ptr[elen + rlen] = 0; + + return ptr + elen; +} + +static void update_event_printk(struct ftrace_event_call *call, + struct trace_enum_map *map) +{ + char *ptr; + int quote = 0; + int len = strlen(map->enum_string); + + for (ptr = call->print_fmt; *ptr; ptr++) { + if (*ptr == '\\') { + ptr++; + /* paranoid */ + if (!*ptr) + break; + continue; + } + if (*ptr == '"') { + quote ^= 1; + continue; + } + if (quote) + continue; + if (isdigit(*ptr)) { + /* skip numbers */ + do { + ptr++; + /* Check for alpha chars like ULL */ + } while (isalnum(*ptr)); + if (!*ptr) + break; + /* + * A number must have some kind of delimiter after + * it, and we can ignore that too. + */ + continue; + } + if (isalpha(*ptr) || *ptr == '_') { + if (strncmp(map->enum_string, ptr, len) == 0 && + !isalnum(ptr[len]) && ptr[len] != '_') { + ptr = enum_replace(ptr, map, len); + /* Hmm, enum string smaller than value */ + if (WARN_ON_ONCE(!ptr)) + return; + /* + * No need to decrement here, as enum_replace() + * returns the pointer to the character passed + * the enum, and two enums can not be placed + * back to back without something in between. + * We can skip that something in between. + */ + continue; + } + skip_more: + do { + ptr++; + } while (isalnum(*ptr) || *ptr == '_'); + if (!*ptr) + break; + /* + * If what comes after this variable is a '.' or + * '->' then we can continue to ignore that string. + */ + if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { + ptr += *ptr == '.' ? 1 : 2; + if (!*ptr) + break; + goto skip_more; + } + /* + * Once again, we can skip the delimiter that came + * after the string. 
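+ *
+ * The net effect of this scan is that an enum name appearing outside
+ * the quoted format, e.g. TRACE_FOO in
+ *   __print_symbolic(REC->sym, { TRACE_FOO, "foo" })
+ * is replaced in place by its numeric value (the names here are
+ * illustrative only).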
+ */ + continue; + } + } +} + +void trace_event_enum_update(struct trace_enum_map **map, int len) +{ + struct ftrace_event_call *call, *p; + const char *last_system = NULL; + int last_i; + int i; + + down_write(&trace_event_sem); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + /* events are usually grouped together with systems */ + if (!last_system || call->class->system != last_system) { + last_i = 0; + last_system = call->class->system; + } + + for (i = last_i; i < len; i++) { + if (call->class->system == map[i]->system) { + /* Save the first system if need be */ + if (!last_i) + last_i = i; + update_event_printk(call, map[i]); + } + } + } + up_write(&trace_event_sem); +} + +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); + list_add(&file->list, &tr->events); + + return file; +} + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; + + return event_create_dir(tr->event_dir, file); +} + +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; + + return 0; +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call); + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&trace_types_lock); + mutex_lock(&event_mutex); + + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call); + + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + return ret; +} + +/* + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + event_remove(call); + trace_destroy_fields(call); + free_event_filter(call->filter); + call->filter = NULL; +} + +static int probe_remove_event_call(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; + +#ifdef CONFIG_PERF_EVENTS + if (call->perf_refcount) + return -EBUSY; +#endif + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + /* + * We can't rely on ftrace_event_enable_disable(enable => 0) + * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * TRACE_REG_UNREGISTER. + */ + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return -EBUSY; + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
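+ * (The loop here is do_for_each_event_file(), not the _safe variant;
+ * nothing is removed from the list inside this loop.)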
+ */ + break; + } while_for_each_event_file(); + + __trace_remove_event_call(call); + + return 0; +} + +/* Remove an event_call */ +int trace_remove_event_call(struct ftrace_event_call *call) +{ + int ret; + + mutex_lock(&trace_types_lock); + mutex_lock(&event_mutex); + down_write(&trace_event_sem); + ret = probe_remove_event_call(call); + up_write(&trace_event_sem); + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + + return ret; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call **call, **start, **end; + + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + for_each_event(call, start, end) { + __register_event(*call, mod); + __add_event_to_tracers(*call); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + bool clear_trace = false; + + down_write(&trace_event_sem); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) + clear_trace = true; + __trace_remove_event_call(call); + } + } + up_write(&trace_event_sem); + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. + */ + if (clear_trace) + tracing_reset_all_online_cpus(); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&trace_types_lock); + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + + return 0; +} + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 1, /* higher than trace.c module notify */ +}; +#endif /* CONFIG_MODULES */ + +/* Create a new event directory structure for a trace directory. 
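+ * Called from event_trace_add_tracer() with trace_event_sem held for
+ * writing: every call on the ftrace_events list gets a directory under
+ * the instance's "events" directory.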
*/ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + ret = __trace_add_new_event(call, tr); + if (ret < 0) + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(call)); + } +} + +struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + const char *name; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = ftrace_event_name(call); + + if (!name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (strcmp(event, name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *_data) +{ + struct event_probe_data *data = _data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, + data->file->event_call->class->system, + ftrace_event_name(data->file->event_call)); + + if (data->count == -1) + seq_puts(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", data->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + data->ref++; + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(data->file, 0, 1); + module_put(data->file->event_call->mod); + kfree(data); + } + *pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled || !param) + return -EINVAL; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + ret = 0; + goto out; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. 
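+ * A count of -1 (the default set above) means "unlimited"; otherwise
+ * event_enable_count_probe() decrements it on each hit and stops
+ * toggling the event once it reaches zero.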
+ */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + module_put(file->event_call->mod); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the tracefs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file); + if (ret < 0) + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(file->event_call)); + } +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (WARN_ON_ONCE(call->mod)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warn("Could not create early event %s\n", + ftrace_event_name(call)); + } +} + +/* Remove the event directory structure for a trace directory. 
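+ * Called from event_trace_del_tracer() with trace_event_sem held for
+ * writing; every ftrace_event_file of the instance is torn down.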
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) + remove_event_file_dir(file); +} + +static void __add_event_to_tracers(struct ftrace_event_call *call) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __trace_add_new_event(call, tr); +} + +extern struct ftrace_event_call *__start_ftrace_events[]; +extern struct ftrace_event_call *__stop_ftrace_events[]; + +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + tracing_selftest_disabled = true; + + return 1; +} +__setup("trace_event=", setup_trace_event); + +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + + entry = tracefs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warn("Could not create tracefs 'set_event' entry\n"); + return -ENOMEM; + } + + d_events = tracefs_create_dir("events", parent); + if (!d_events) { + pr_warn("Could not create tracefs 'events' directory\n"); + return -ENOMEM; + } + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + + tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
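+ * early_event_add_tracer() mirrors event_trace_add_tracer() but walks
+ * the already-allocated descriptors on tr->events via
+ * __trace_early_add_event_dirs() instead of allocating new ones.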
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ + mutex_lock(&event_mutex); + + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + + /* Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + + down_write(&trace_event_sem); + __trace_remove_event_dirs(tr); + tracefs_remove_recursive(tr->event_dir); + up_write(&trace_event_sem); + + tr->event_dir = NULL; + + mutex_unlock(&event_mutex); + + return 0; +} + +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + return 0; +} + +static __init void +early_enable_events(struct trace_array *tr, bool disable_first) +{ + char *buf = bootup_event_buf; + char *token; + int ret; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); + + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + +static __init int event_trace_enable(void) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_call **iter, *call; + int ret; + + if (!tr) + return -ENODEV; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + + early_enable_events(tr, false); + + trace_printk_start_comm(); + + register_event_cmds(); + + register_trigger_cmds(); + + return 0; +} + +/* + * event_trace_enable() is called from trace_event_init() first to + * initialize events and perhaps start any events that are on the + * command line. Unfortunately, there are some events that will not + * start this early, like the system call tracepoints that need + * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() + * is called before pid 1 starts, and this flag is never set, making + * the syscall tracepoint never get reached, but the event is enabled + * regardless (and not doing anything). 
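+ *
+ * event_trace_enable_again() runs as an early_initcall, after pid 1
+ * exists, and re-applies the boot command line events; passing
+ * disable_first == true makes early_enable_events() disable each
+ * event before re-enabling it so that the syscall registration is
+ * redone.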
+ */ +static __init int event_trace_enable_again(void) +{ + struct trace_array *tr; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + early_enable_events(tr, true); + + return 0; +} + +early_initcall(event_trace_enable_again); + +static __init int event_trace_init(void) +{ + struct trace_array *tr; + struct dentry *d_tracer; + struct dentry *entry; + int ret; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + entry = tracefs_create_file("available_events", 0444, d_tracer, + tr, &ftrace_avail_fops); + if (!entry) + pr_warn("Could not create tracefs 'available_events' entry\n"); + + if (trace_define_common_fields()) + pr_warn("tracing: Failed to allocate common fields"); + + ret = early_event_add_tracer(d_tracer, tr); + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = register_module_notifier(&trace_module_nb); + if (ret) + pr_warn("Failed to register trace events module notifier\n"); +#endif + return 0; +} + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); +} + +fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init void event_trace_self_tests(void) +{ + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; + struct ftrace_event_call *call; + struct event_subsystem *system; + struct trace_array *tr; + int ret; + + tr = top_trace_array(); + if (!tr) + return; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) + continue; + +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) + continue; +#endif + + pr_info("Testing event %s: ", ftrace_event_name(call)); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. 
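+ * (The test would toggle the event off again and disturb whoever
+ * enabled it.)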
+ */ + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + pr_warn("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + ftrace_event_enable_disable(file, 1); + event_test_stuff(); + ftrace_event_enable_disable(file, 0); + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warn("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warn("error disabling system %s\n", + system->name); + continue; + } + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warn("error enabling all events\n"); + return; + } + + event_test_stuff(); + + /* reset sysname */ + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warn("error disabling all events\n"); + return; + } + + pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int cpu; + int pc; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(&buffer, + TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_buffer_unlock_commit(buffer, event, flags, pc); + + out: + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static __init void event_trace_self_test_with_function(void) +{ + int ret; + ret = register_ftrace_function(&trace_ops); + if (WARN_ON(ret < 0)) { + pr_info("Failed to enable function tracer for event tests\n"); + return; + } + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } + + return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 000000000..52adf02d7 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,2440 @@ +/* + * trace_events_filter - generic event filtering + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/perf_event.h> +#include <linux/slab.h> + +#include "trace.h" +#include "trace_output.h" + +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + +enum filter_op_ids +{ + OP_OR, + OP_AND, + OP_GLOB, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_BAND, + OP_NOT, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +/* Order must be the same as enum filter_op_ids above */ +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_BAND, "&", 6 }, + { OP_NOT, "!", 6 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, + FILT_ERR_ILLEGAL_NOT_OP, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", + "Only 'ip' field is supported for function trace", + "Illegal use of '!'", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +struct pred_stack { + struct filter_pred **preds; + int index; +}; + +/* If not of not match is equal to not of not, then it is a match */ +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match 
= 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + case OP_BAND: \ + match = (*addr & val); \ + break; \ + default: \ + break; \ + } \ + \ + return !!match == !pred->not; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +/* Filter predicate for fixed sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); + + match = cmp ^ pred->not; + + return match; +} + +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ + + cmp = pred->regex.match(*addr, &pred->regex, len); + + match = cmp ^ pred->not; + + return match; +} + +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. 
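+ *
+ * The 32-bit __data_loc word packs the offset in its low 16 bits and
+ * the length in its high 16 bits, which is how filter_pred_strloc()
+ * unpacks it below.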
+ */ +static int filter_pred_strloc(struct filter_pred *pred, void *event) +{ + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, r->len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strnstr(str, r->pattern, len)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + int strlen = len - 1; + + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. 
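+ *
+ * For example (patterns are illustrative): "sched*" yields
+ * MATCH_FRONT_ONLY with search pointing at "sched", "*switch" yields
+ * MATCH_END_ONLY, and "*sched*" yields MATCH_MIDDLE_ONLY.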
+ */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +enum walk_return { + WALK_PRED_ABORT, + WALK_PRED_PARENT, + WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, + struct filter_pred *pred, + int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, + struct filter_pred *root, + filter_pred_walkcb_t cb, void *data) +{ + struct filter_pred *pred = root; + enum move_type move = MOVE_DOWN; + int done = 0; + + if (!preds) + return -EINVAL; + + do { + int err = 0, ret; + + ret = cb(move, pred, &err, data); + if (ret == WALK_PRED_ABORT) + return err; + if (ret == WALK_PRED_PARENT) + goto get_parent; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + goto get_parent; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + get_parent: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, + &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int match = 0; + int type; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). 
Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + if (!WARN_ON_ONCE(!pred->fn)) + match = pred->fn(pred, rec); + if (!!match == type) + break; + } + /* If not of not match is equal to not of not, then it is a match */ + return !!match == !op->not; +} + +struct filter_match_preds_data { + struct filter_pred *preds; + int match; + void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_match_preds_data *d = data; + + *err = 0; + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) + return WALK_PRED_DEFAULT; + /* We can treat folded ops as a leaf node */ + d->match = process_ops(d->preds, pred, d->rec); + } else { + if (!WARN_ON_ONCE(!pred->fn)) + d->match = pred->fn(pred, d->rec); + } + + return WALK_PRED_PARENT; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!d->match == (pred->op == OP_OR)) + return WALK_PRED_PARENT; + break; + case MOVE_UP_FROM_RIGHT: + break; + } + + return WALK_PRED_DEFAULT; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct event_filter *filter, void *rec) +{ + struct filter_pred *preds; + struct filter_pred *root; + struct filter_match_preds_data data = { + /* match is currently meaningless */ + .match = -1, + .rec = rec, + }; + int n_preds, ret; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. 
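+ * (hence the rcu_dereference_sched() calls below rather than plain
+ * loads)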
+ */ + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + data.preds = preds = rcu_dereference_sched(filter->preds); + ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); + WARN_ON(ret); + return data.match; +} +EXPORT_SYMBOL_GPL(filter_match_preds); + +static void parse_error(struct filter_parse_state *ps, int err, int pos) +{ + ps->lasterr = err; + ps->lasterr_pos = pos; +} + +static void remove_filter_string(struct event_filter *filter) +{ + if (!filter) + return; + + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; + + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); +} + +static inline struct event_filter *event_filter(struct ftrace_event_file *file) +{ + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + return file->event_call->filter; + else + return file->filter; +} + +/* caller must hold event_mutex */ +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +{ + struct event_filter *filter = event_filter(file); + + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_puts(s, "none\n"); +} + +void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s) +{ + struct event_filter *filter; + + mutex_lock(&event_mutex); + filter = system->filter; + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + mutex_unlock(&event_mutex); +} + +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + 
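+/*
+ * Editor's illustration, not part of the original commit: for the
+ * postfix form of "a == 1 && b == 2" the stack above is used as
+ * follows while the predicate tree is built:
+ *
+ *   push(pred "a == 1")          stack: [ a==1 ]
+ *   push(pred "b == 2")          stack: [ a==1, b==2 ]
+ *   OP_AND: pop right (b==2), pop left (a==1),
+ *           push the AND node    stack: [ && ]
+ *
+ * filter_set_pred() below performs exactly this pop/pop/push step for
+ * OP_AND/OP_OR and records the parent/child indices that
+ * walk_pred_tree() uses later.
+ */
+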
+static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, + struct filter_pred *src) +{ + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + + *dest = *src; + dest->index = idx; + + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + ((left->op == dest->op && !left->not) || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + ((right->op == dest->op && !right->not) || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); +} + +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + +static void filter_disable(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_FILTERED; + else + file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + +static void __free_filter(struct event_filter *filter) +{ + if (!filter) + return; + + __free_preds(filter); + kfree(filter->filter_string); + kfree(filter); +} + +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + +static struct event_filter *__alloc_filter(void) +{ + struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) + __free_preds(filter); + + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); + + if (!filter->preds) + return -ENOMEM; + + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; + pred->fn = filter_pred_none; + } + + return 0; +} + +static inline void __remove_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + filter_disable(file); + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + remove_filter_string(call->filter); + else + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir) + continue; + __remove_filter(file); + } +} + +static inline void __free_subsystem_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { + __free_filter(call->filter); + call->filter = NULL; + } else { + __free_filter(file->filter); + 
file->filter = NULL; + } +} + +static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir) + continue; + __free_subsystem_filter(file); + } +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack) +{ + int err; + + if (WARN_ON(filter->n_preds == filter->a_preds)) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + err = filter_set_pred(filter, filter->n_preds, stack, pred); + if (err) + return err; + + filter->n_preds++; + + return 0; +} + +int filter_assign_type(const char *type) +{ + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + + if (strchr(type, '[') && strstr(type, "char")) + return FILTER_STATIC_STRING; + + return FILTER_OTHER; +} + +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + +static bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int init_pred(struct filter_parse_state *ps, + struct ftrace_event_field *field, + struct filter_pred *pred) + +{ + filter_pred_fn_t fn = filter_pred_none; + unsigned long long val; + int ret; + + pred->offset = field->offset; + + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + + if (is_string_field(field)) { + filter_build_regex(pred); + + if (field->filter_type == FILTER_STATIC_STRING) { + fn = filter_pred_string; + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) + fn = filter_pred_strloc; + else + fn = filter_pred_pchar; + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } + } else { + if (field->is_signed) + ret = kstrtoll(pred->regex.pattern, 0, &val); + else + ret = kstrtoull(pred->regex.pattern, 0, &val); + if (ret) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); + return -EINVAL; + } + pred->val = val; + + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + } + + if (pred->op == 
OP_NE) + pred->not ^= 1; + + pred->fn = fn; + return 0; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); + + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + if (!ps->infix.cnt) + return 0; + + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + if (!ps->infix.cnt) + return; + + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } + + return 0; +} + +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; + } + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = 
kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + list_del(&elt->list); + kfree(elt->operand); + kfree(elt); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int in_string = 0; + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); + continue; + } + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + continue; + } +parse_operand: + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + int op, char *operand1, char *operand2) +{ + struct ftrace_event_field *field; + static struct filter_pred pred; + + memset(&pred, 0, sizeof(pred)); + pred.op = op; + + if (op == OP_AND || op == OP_OR) + return &pred; + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return NULL; + } + + field = trace_find_event_field(call, operand1); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); + return NULL; + } + + strcpy(pred.regex.pattern, operand2); + pred.regex.len = strlen(pred.regex.pattern); + pred.field = field; + return init_pred(ps, field, &pred) ? 
NULL : &pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + int cnt = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + cnt++; + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + cnt--; + continue; + } + if (elt->op != OP_NOT) + cnt--; + n_normal_preds++; + /* all ops should have operands */ + if (cnt < 0) + break; + } + + if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); + return -EINVAL; + } + + return 0; +} + +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +struct check_pred_data { + int count; + int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct check_pred_data *d = data; + + if (WARN_ON(d->count++ > d->max)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + return WALK_PRED_DEFAULT; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct check_pred_data data = { + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + .max = 3 * filter->n_preds, + .count = 0, + }; + + return walk_pred_tree(filter->preds, root, + check_pred_tree_cb, &data); +} + +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + int *count = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) + (*count)++; + + return WALK_PRED_DEFAULT; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + int count = 0, ret; + + ret = walk_pred_tree(preds, root, count_leafs_cb, &count); + WARN_ON(ret); + return count; +} + +struct fold_pred_data { + struct filter_pred *root; + int count; + int children; +}; + +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct fold_pred_data *d = data; + struct filter_pred *root = d->root; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (pred->left != FILTER_PRED_INVALID) + return WALK_PRED_DEFAULT; + + if (WARN_ON(d->count == d->children)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + + pred->index &= ~FILTER_PRED_FOLD; + root->ops[d->count++] = pred->index; + return WALK_PRED_DEFAULT; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct fold_pred_data data = { + .root = root, + .count = 0, + }; + int children; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + data.children = children; + return walk_pred_tree(preds, root, 
fold_pred_cb, &data); +} + +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_pred *preds = data; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (!(pred->index & FILTER_PRED_FOLD)) + return WALK_PRED_DEFAULT; + + *err = fold_pred(preds, pred); + if (*err) + return WALK_PRED_ABORT; + + /* eveyrhing below is folded, continue with parent */ + return WALK_PRED_PARENT; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, + filter->preds); +} + +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, + struct filter_parse_state *ps, + bool dry_run) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct filter_pred *root; + struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ + int err; + int n_preds = 0; + + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + err = check_preds(ps); + if (err) + return err; + + if (!dry_run) { + err = __alloc_pred_stack(&stack, n_preds); + if (err) + return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; + } + + n_preds = 0; + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + err = -EINVAL; + goto fail; + } + continue; + } + + if (elt->op == OP_NOT) { + if (!n_preds || operand1 || operand2) { + parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); + err = -EINVAL; + goto fail; + } + if (!dry_run) + filter->preds[n_preds - 1].not ^= 1; + continue; + } + + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + err = -ENOSPC; + goto fail; + } + + pred = create_pred(ps, call, elt->op, operand1, operand2); + if (!pred) { + err = -EINVAL; + goto fail; + } + + if (!dry_run) { + err = filter_add_pred(ps, filter, pred, &stack); + if (err) + goto fail; + } + + operand1 = operand2 = NULL; + } + + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; +} + +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, + struct event_filter *filter) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & 
TRACE_EVENT_FL_USE_CALL_FILTER) + rcu_assign_pointer(call->filter, filter); + else + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + RCU_INIT_POINTER(call->filter, NULL); + else + RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + return true; + + if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && + (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) + return true; + + return false; +} + +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +static int replace_system_preds(struct ftrace_subsystem_dir *dir, + struct trace_array *tr, + struct filter_parse_state *ps, + char *filter_string) +{ + struct ftrace_event_file *file; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); + bool fail = true; + int err; + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir) + continue; + + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(file->event_call, NULL, ps, true); + if (err) + event_set_no_set_filter_flag(file); + else + event_clear_no_set_filter_flag(file); + } + + list_for_each_entry(file, &tr->events, list) { + struct event_filter *filter; + + if (file->system != dir) + continue; + + if (event_no_set_filter_flag(file)) + continue; + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; + + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; + + err = replace_preds(file->event_call, filter, ps, false); + if (err) { + filter_disable(file); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else + event_set_filtered_flag(file); + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = event_filter(file); + event_set_filter(file, filter_item->filter); + filter_item->filter = filter; + + fail = false; + } + + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. 
+ */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; +} + +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(call, filter_str, set_str, filterp); +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. 
+ */ +static int create_system_filter(struct ftrace_subsystem_dir *dir, + struct trace_array *tr, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(dir, tr, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/* caller must hold event_mutex */ +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) +{ + struct ftrace_event_call *call = file->event_call; + struct event_filter *filter; + int err; + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable(file); + filter = event_filter(file); + + if (!filter) + return 0; + + event_clear_filter(file); + + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_filter(filter); + + return 0; + } + + err = create_filter(call, filter_string, true, &filter); + + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + if (filter) { + struct event_filter *tmp; + + tmp = event_filter(file); + if (!err) + event_set_filtered_flag(file); + else + filter_disable(file); + + event_set_filter(file, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } + } + + return err; +} + +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, + char *filter_string) +{ + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + struct event_filter *filter; + int err = 0; + + mutex_lock(&event_mutex); + + /* Make sure the system still has events */ + if (!dir->nr_events) { + err = -ENODEV; + goto out_unlock; + } + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(dir, tr); + remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(dir, tr); + __free_filter(filter); + goto out_unlock; + } + + err = create_system_filter(dir, tr, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; + } +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_PERF_EVENTS + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_filter(filter); +} + +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. 
+ */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret = -EINVAL; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? 
WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + + call = event->tp_event; + + err = -EINVAL; + if (!call) + goto out_unlock; + + err = -EEXIST; + if (event->filter) + goto out_unlock; + + err = create_filter(call, filter_str, false, &filter); + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); + else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) + __free_filter(filter); + +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#endif /* CONFIG_PERF_EVENTS */ + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct ftrace_raw_ftrace_test_filter rec; + int match; + char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f 
== 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + char *fields = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) { + struct ftrace_event_field *field = pred->field; + + if (!field) { + WARN(1, "all leafs should have field defined"); + return WALK_PRED_DEFAULT; + } + if (!strchr(fields, *field->name)) + return WALK_PRED_DEFAULT; + + WARN_ON(!pred->fn); + pred->fn = test_pred_visited_fn; + } + return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) +{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", + d->filter, err); + __free_filter(filter); + break; + } + + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. 
+ */ + preempt_disable(); + if (*d->not_visited) + walk_pred_tree(filter->preds, filter->root, + test_walk_pred_cb, + d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + preempt_enable(); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000..bfd4dba0d --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e = e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 000000000..8712df9de --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + synchronize_sched(); /* make sure current triggers exit before free */ + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command. If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked. If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ + struct event_trigger_data *data; + enum event_trigger_type tt = ETT_NONE; + struct event_filter *filter; + + if (list_empty(&file->triggers)) + return tt; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (!rec) { + data->ops->func(data); + continue; + } + filter = rcu_dereference_sched(data->filter); + if (filter && !filter_match_preds(filter, rec)) + continue; + if (data->cmd_ops->post_trigger) { + tt |= data->cmd_ops->trigger_type; + continue; + } + data->ops->func(data); + } + return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). 
+ */ +void +event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type & tt) + data->ops->func(data); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct ftrace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) + return NULL; + + return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data->ops, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ + char *command, *next = buff; + struct event_command *p; + int ret = -EINVAL; + + command = strsep(&next, ": \t"); + command = (command[0] != '!') ? 
command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long)buf); + return -EFAULT; + } + buf[cnt] = '\0'; + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + free_page((unsigned long)buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long)buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. 
+ * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_puts(m, name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_putc(m, '\n'); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data; + list_for_each_entry_rcu(data, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + if (data->ops->free) + data->ops->free(data->ops, data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. 
+ */ +static void update_cond_flag(struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->filter || data->cmd_ops->post_trigger) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. 
+ * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + char *trigger = NULL; + char *number; + int ret; + + /* separate the trigger from the filter (t:n [if filter]) */ + if (param && isdigit(param[0])) + trigger = strsep(¶m, " \t"); + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + ret = 0; + out: + return ret; + + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). 
+ * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->event_call, filter_str, false, &filter); + if (ret) + goto out; + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .func = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .func = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .func = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .func = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? 
&traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + int ret = register_trigger(glob, ops, data, file); + + if (ret > 0 && tracing_alloc_snapshot() != 0) { + unregister_trigger(glob, ops, data, file); + ret = 0; + } + + return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .func = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .func = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? 
&snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .func = event_trigger_callback, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + * stacktrace_trigger() + * event_triggers_post_call() + * ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .func = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .func = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .post_trigger = true, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct enable_trigger_data { + struct ftrace_event_file *file; + bool enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->file->event_call->class->system, + ftrace_event_name(enable_data->file->event_call)); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_putc(m, '\n'); + + return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + module_put(enable_data->file->event_call->mod); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct ftrace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + struct trace_array *tr = file->tr; + const char *system; + const char *event; + char *trigger; + char *number; + bool enable; + int ret; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (s:e:n [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + system = strsep(&trigger, ":"); + if (!trigger) + return -EINVAL; + + event = strsep(&trigger, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) { + kfree(trigger_data); + goto out; + } + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + enable_data->enable = enable; + enable_data->file = event_enable_file; + trigger_data->private_data = enable_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. 
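+ *
+ * e.g. 'enable_event:sched:sched_switch:5' arrives here with
+ * number pointing at "5", limiting the trigger to five hits.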
+ */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(event_enable_file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + return ret; + + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + module_put(event_enable_file->event_call->mod); + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + kfree(enable_data); + goto out; +} + +static int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +static void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct enable_trigger_data *enable_data; + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + enable_data = data->private_data; + if (enable_data && + (enable_data->file == test_enable_data->file)) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? 
&event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + + return 0; +} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c new file mode 100644 index 000000000..174a6a711 --- /dev/null +++ b/kernel/trace/trace_export.c @@ -0,0 +1,195 @@ +/* + * trace_export.c - export basic ftrace utilities to user space + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/stringify.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "trace_output.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace + +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) + +#undef __field +#define __field(type, item) type item; + +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) 
fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + +#include "trace_entries.h" + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ + if (ret) \ + return ret; + +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), filter_type); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + do { \ + char *type_str = #type"["__stringify(len)"]"; \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, type_str, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ + if (ret) \ + return ret; \ + } while (0); + +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), filter_type); \ + if (ret) \ + return ret; + +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), filter_type);\ + if (ret) \ + return ret; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +static int __init \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ +{ \ + struct struct_name field; \ + int ret; \ + int filter_type = filter; \ + \ + tstruct; \ + \ + return ret; \ +} + +#include "trace_entries.h" + +#undef __entry +#define __entry REC + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef F_printk +#define F_printk(fmt, args...) 
__stringify(fmt) ", " __stringify(args) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ + \ +struct ftrace_event_class __refdata event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ +}; \ + \ +struct ftrace_event_call __used event_##call = { \ + .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ + .print_fmt = print, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ +}; \ +struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 000000000..fcd41a166 --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,689 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/ring_buffer.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "trace.h" + +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + + tr->ops = ops; + ops->private = tr; + return 0; +} + + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* + * The top level array uses the "global_ops", and the files are + * created on boot up. + */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} + +static int function_trace_init(struct trace_array *tr) +{ + ftrace_func_t func; + + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. 
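+ * A NULL tr->ops here therefore means that earlier allocation
+ * failed, so this instance cannot run the function tracer.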
+ */ + if (!tr->ops) + return -ENOMEM; + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); + + tr->trace_buffer.cpu = get_cpu(); + put_cpu(); + + tracing_start_cmdline_record(); + tracing_start_function_trace(tr); + return 0; +} + +static void function_trace_reset(struct trace_array *tr) +{ + tracing_stop_function_trace(tr); + tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); +} + +static void function_trace_start(struct trace_array *tr) +{ + tracing_reset_online_cpus(&tr->trace_buffer); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + int bit; + int cpu; + int pc; + + if (unlikely(!tr->function_enabled)) + return; + + pc = preempt_count(); + preempt_disable_notrace(); + + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); + trace_function(tr, ip, parent_ip, flags, pc); + } + trace_clear_recursion(bit); + + out: + preempt_enable_notrace(); +} + +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. 
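+ * Disabling interrupts below also keeps us on this CPU, so
+ * raw_smp_processor_id() and the per-cpu 'disabled' counter
+ * stay consistent for the duration of the call.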
+ */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, ip, parent_ip, flags, pc); + /* + * skip over 5 funcs: + * __ftrace_trace_stack, + * __trace_stack, + * function_stack_trace_call + * ftrace_list_func + * ftrace_call + */ + __trace_stack(tr, flags, 5, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = 0, /* By default: all flags disabled */ + .opts = func_opts +}; + +static void tracing_start_function_trace(struct trace_array *tr) +{ + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; +} + +static void tracing_stop_function_trace(struct trace_array *tr) +{ + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); +} + +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + switch (bit) { + case TRACE_FUNC_OPT_STACK: + /* do nothing if already set */ + if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) + break; + + unregister_ftrace_function(tr->ops); + + if (set) { + tr->ops->func = function_stack_trace_call; + register_ftrace_function(tr->ops); + } else { + tr->ops->func = function_trace_call; + register_ftrace_function(tr->ops); + } + + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct tracer function_trace __tracer_data = +{ + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, + .flags = &func_flags, + .set_flag = func_set_flag, + .allow_instances = true, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +#ifdef CONFIG_DYNAMIC_FTRACE +static void update_traceon_count(void **data, bool on) +{ + long *count = (long *)data; + long old_count = *count; + + /* + * Tracing gets disabled (or enabled) once per count. + * This function can be called at the same time on multiple CPUs. + * It is fine if both disable (or enable) tracing, as disabling + * (or enabling) the second time doesn't do anything as the + * state of the tracer is already disabled (or enabled). + * What needs to be synchronized in this case is that the count + * only gets decremented once, even if the tracer is disabled + * (or enabled) twice, as the second one is really a nop. + * + * The memory barriers guarantee that we only decrement the + * counter once. First the count is read to a local variable + * and a read barrier is used to make sure that it is loaded + * before checking if the tracer is in the state we want. + * If the tracer is not in the state we want, then the count + * is guaranteed to be the old count. + * + * Next the tracer is set to the state we want (disabled or enabled) + * then a write memory barrier is used to make sure that + * the new state is visible before changing the counter by + * one minus the old counter. This guarantees that another CPU + * executing this code will see the new state before seeing + * the new counter value, and would not do anything if the new + * counter is seen. + * + * Note, there is no synchronization between this and a user + * setting the tracing_on file. But we currently don't care + * about that. 
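+ *
+ * (The smp_rmb() below and the smp_wmb() further down provide
+ * the ordering described above.)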
+ */ + if (!old_count) + return; + + /* Make sure we see count before checking tracing state */ + smp_rmb(); + + if (on == !!tracing_is_on()) + return; + + if (on) + tracing_on(); + else + tracing_off(); + + /* unlimited? */ + if (old_count == -1) + return; + + /* Make sure tracing state is visible before updating count */ + smp_wmb(); + + *count = old_count - 1; +} + +static void +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + update_traceon_count(data, 1); +} + +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + update_traceon_count(data, 0); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +/* + * Skip 4: + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_list_func() + * ftrace_call() + */ +#define STACK_SKIP 4 + +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + long old_count; + long new_count; + + /* + * Stack traces should only execute the number of times the + * user specified in the counter. + */ + do { + + if (!tracing_is_on()) + return; + + old_count = *count; + + if (!old_count) + return; + + /* unlimited? */ + if (old_count == -1) { + trace_dump_stack(STACK_SKIP); + return; + } + + new_count = old_count - 1; + new_count = cmpxchg(count, old_count, new_count); + if (new_count == old_count) + trace_dump_stack(STACK_SKIP); + + } while (new_count != old_count); +} + +static int update_count(void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return 0; + + if (*count != -1) + (*count)--; + + return 1; +} + +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. 
*/ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + +static int +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:%s", (void *)ip, name); + + if (count == -1) + seq_puts(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static int +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceon", m, ip, data); +} + +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, data); +} + +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("stacktrace", m, ip, data); +} + +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; + +static int +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) +{ + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + return ret < 0 ? ret : 0; +} + +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? 
&traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + +static struct ftrace_func_command ftrace_traceon_cmd = { + .name = "traceon", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_traceoff_cmd = { + .name = "traceoff", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + +static int __init init_func_cmd_traceon(void) +{ + int ret; + + ret = register_ftrace_command(&ftrace_traceoff_cmd); + if (ret) + return ret; + + ret = register_ftrace_command(&ftrace_traceon_cmd); + if (ret) + goto out_free_traceoff; + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + + return 0; + + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + + return ret; +} +#else +static inline int init_func_cmd_traceon(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static __init int init_function_trace(void) +{ + init_func_cmd_traceon(); + return register_tracer(&function_trace); +} +core_initcall(init_function_trace); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 000000000..a51e79688 --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,1470 @@ +/* + * + * Function graph tracer. + * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "trace.h" +#include "trace_output.h" + +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. 
This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. + */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + +struct fgraph_cpu_data { + pid_t last_pid; + int depth; + int depth_irq; + int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; +}; + +struct fgraph_data { + struct fgraph_cpu_data __percpu *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; +}; + +#define TRACE_GRAPH_INDENT 2 + +static unsigned int max_depth; + +static struct tracer_opt trace_opts[] = { + /* Display overruns? (for self-debug purpose) */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + /* Don't display overruns, proc, or tail by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, + .opts = trace_opts +}; + +static struct trace_array *graph_array; + +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, +}; + +static void +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags); + +/* Add a function return address to the trace stack on thread info.*/ +int +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer) +{ + unsigned long long calltime; + int index; + + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + + if (!current->ret_stack) + return -EBUSY; + + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. 
To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in tracefs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + + calltime = trace_clock_local(); + + index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; + current->ret_stack[index].fp = frame_pointer; + *depth = current->curr_ret_stack; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) +{ + int index; + + index = current->curr_ret_stack; + + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %ps return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. 
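+ *
+ * Called from the arch's return trampoline. The frame_pointer
+ * argument is compared in ftrace_pop_return_trace() (when
+ * HAVE_FUNCTION_GRAPH_FP_TEST is set) against the value saved
+ * at function entry by ftrace_push_return_trace().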
+ */ +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret, frame_pointer); + trace.rettime = trace_clock_local(); + barrier(); + current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } + + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. We don't want to + * lose the interrupt if max_depth is set. + */ + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return 0; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); + + return 1; +} + +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) + return 0; + + return in_irq(); +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (!ftrace_trace_task(current)) + return 0; + + /* trace it when it is-nested-in or is a function enabled. */ + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || (trace->depth < 0) || + (max_depth && trace->depth >= max_depth)) + return 0; + + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. 
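+ * (That entry is made negative in ftrace_push_return_trace(),
+ * which subtracts FTRACE_NOTRACE_DEPTH from curr_ret_stack.)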
+ */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + +void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + +static int graph_trace_init(struct trace_array *tr) +{ + int ret; + + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; +} + +static void graph_trace_reset(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); +} + +static int graph_trace_update_thresh(struct trace_array *tr) +{ + graph_trace_reset(tr); + return graph_trace_init(tr); +} + +static int max_bytes_for_cpu; + +static void print_graph_cpu(struct trace_seq *s, int cpu) +{ + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * 
email: + */ + trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); +} + +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static void print_graph_proc(struct trace_seq *s, pid_t pid) +{ + char comm[TASK_COMM_LEN]; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + int spaces = 0; + int len; + int i; + + trace_find_cmdline(pid, comm); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%s-%s", comm, pid_str); + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) + trace_seq_putc(s, ' '); +} + + +static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + trace_seq_putc(s, ' '); + trace_print_lat_fmt(s, entry); +} + +/* If the pid changed since the last trace, output this event */ +static void +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) +{ + pid_t prev_pid; + pid_t *last_pid; + + if (!data) + return; + + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + + if (*last_pid == pid) + return; + + prev_pid = *last_pid; + *last_pid = pid; + + if (prev_pid == -1) + return; +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + trace_seq_puts(s, " ------------------------------------------\n"); + print_graph_cpu(s, cpu); + print_graph_proc(s, prev_pid); + trace_seq_puts(s, " => "); + print_graph_proc(s, pid); + trace_seq_puts(s, "\n ------------------------------------------\n\n"); +} + +static struct ftrace_graph_ret_entry * +get_return_for_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { + + ring_iter = trace_buffer_iter(iter, iter->cpu); + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, + NULL, NULL); + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, + NULL, NULL); + } + + if (!event) + return NULL; + + next = ring_buffer_event_data(event); + + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. 
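+ * Storing just the type is enough for the reuse path above to
+ * decide that the saved entry is not a leaf.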
+ */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; + } + } + + if (next->ent.type != TRACE_GRAPH_RET) + return NULL; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return NULL; + + /* this is a leaf, now advance the iterator */ + if (ring_iter) + ring_buffer_read(ring_iter, NULL); + + return next; +} + +static void print_graph_abs_time(u64 t, struct trace_seq *s) +{ + unsigned long usecs_rem; + + usecs_rem = do_div(t, NSEC_PER_SEC); + usecs_rem /= 1000; + + trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); +} + +static void +print_graph_irq(struct trace_iterator *iter, unsigned long addr, + enum trace_type type, int cpu, pid_t pid, u32 flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); + + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + print_graph_proc(s, pid); + trace_seq_puts(s, " | "); + } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); + } + + /* No overhead */ + print_graph_duration(0, s, flags | FLAGS_FILL_START); + + if (type == TRACE_GRAPH_ENT) + trace_seq_puts(s, "==========>"); + else + trace_seq_puts(s, "<=========="); + + print_graph_duration(0, s, flags | FLAGS_FILL_END); + trace_seq_putc(s, '\n'); +} + +void +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ + char usecs_str[21]; + char nsecs_str[5]; + int len; + int i; + + sprintf(usecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + trace_seq_printf(s, "%s", usecs_str); + + len = strlen(usecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); + trace_seq_printf(s, ".%s", nsecs_str); + len += strlen(nsecs_str); + } + + trace_seq_puts(s, " us "); + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) + trace_seq_putc(s, ' '); +} + +static void +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags) +{ + if (!(flags & TRACE_GRAPH_PRINT_DURATION) || + !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + /* No real adata, just filling the column with spaces */ + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: + trace_seq_puts(s, " | "); + return; + case FLAGS_FILL_START: + trace_seq_puts(s, " "); + return; + case FLAGS_FILL_END: + trace_seq_puts(s, " |"); + return; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) + trace_seq_printf(s, "%c ", trace_find_mark(duration)); + else + trace_seq_puts(s, " "); + + trace_print_graph_duration(duration, s); + trace_seq_puts(s, "| "); +} + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct 
ftrace_graph_ret *graph_ret; + struct ftrace_graph_ent *call; + unsigned long long duration; + int i; + + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. + */ + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; + } + + /* Overhead and duration */ + print_graph_duration(duration, s, flags); + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%ps();\n", (void *)call->func); + + return trace_handle_return(s); +} + +static enum print_line_t +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu, u32 flags) +{ + struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + int i; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; + } + + /* No time */ + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%ps() {\n", (void *)call->func); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + /* + * we already consumed the current entry to check the next one + * and see if this is a leaf. + */ + return TRACE_TYPE_NO_CONSUME; +} + +static void +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct trace_entry *ent = iter->ent; + int cpu = iter->cpu; + + /* Pid */ + verif_pid(s, ent->pid, cpu, data); + + if (type) + /* Interrupt */ + print_graph_irq(iter, addr, type, cpu, ent->pid, flags); + + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); + + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + print_graph_proc(s, ent->pid); + trace_seq_puts(s, " | "); + } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); + + return; +} + +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just entered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. 
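+ * (funcgraph-irqs is set by default, so the filtering below only
+ * happens when the user has cleared that option.)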
+ */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; + + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + + print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); + + leaf_ret = get_return_for_leaf(iter, field); + if (leaf_ret) + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + else + ret = print_graph_entry_nested(iter, field, s, cpu, flags); + + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) +{ + unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int func_match = 1; + int i; + + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. 
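+ *
+ * print_graph_comment() below indents by (depth + 1) * TRACE_GRAPH_INDENT,
+ * so storing (trace->depth - 1) here lines a trailing comment up with
+ * the closing brace, which is itself indented by
+ * trace->depth * TRACE_GRAPH_INDENT.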
+ */ + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } + } + + print_graph_prologue(iter, s, 0, 0, flags); + + /* Overhead and duration */ + print_graph_duration(duration, s, flags); + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. + */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + + /* Overrun */ + if (flags & TRACE_GRAPH_PRINT_OVERRUN) + trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); + + print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); + + return trace_handle_return(s); +} + +static enum print_line_t +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct fgraph_data *data = iter->private; + struct trace_event *event; + int depth = 0; + int ret; + int i; + + if (data) + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; + + print_graph_prologue(iter, s, 0, 0, flags); + + /* No time */ + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + + /* Indentation */ + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + /* The comment */ + trace_seq_puts(s, "/* "); + + switch (iter->ent->type) { + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = event->funcs->trace(iter, sym_flags, event); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } + + if (trace_seq_has_overflowed(s)) + goto out; + + /* Strip ending newline */ + if (s->buffer[s->seq.len - 1] == '\n') { + s->buffer[s->seq.len - 1] = '\0'; + s->seq.len--; + } + + trace_seq_puts(s, " */\n"); + out: + return trace_handle_return(s); +} + + +enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. 
+ */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter, flags); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry saved; + trace_assign_type(field, entry); + saved = *field; + return print_graph_entry(&saved, s, iter, flags); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry, iter, flags); + } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + + default: + return print_graph_comment(s, entry, iter, flags); + } + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return print_graph_function_flags(iter, tracer_flags.val); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (flags & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (flags & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); +} + +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s, flags); + + /* 1st line */ + seq_putc(s, '#'); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_puts(s, " CPU"); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_puts(s, " TASK/PID "); + if (lat) + seq_puts(s, "||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_puts(s, " DURATION "); + seq_puts(s, " FUNCTION CALLS\n"); + + /* 2nd line */ + seq_putc(s, '#'); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_puts(s, " | | "); + if (lat) + seq_puts(s, "||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_puts(s, " | | "); + seq_puts(s, " | | | |\n"); +} + +static void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + } + + __print_graph_headers_flags(s, flags); +} + +void graph_trace_open(struct trace_iterator *iter) +{ + /* 
pid and depth on the last trace processed */ + struct fgraph_data *data; + gfp_t gfpflags; + int cpu; + + iter->private = NULL; + + /* We can be called in atomic context via ftrace_dump() */ + gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; + + data = kzalloc(sizeof(*data), gfpflags); + if (!data) + goto out_err; + + data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + *pid = -1; + *depth = 0; + *ignore = 0; + *depth_irq = -1; + } + + iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); +} + +void graph_trace_close(struct trace_iterator *iter) +{ + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } +} + +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + +static struct tracer graph_trace __tracer_data = { + .name = "function_graph", + .update_thresh = graph_trace_update_thresh, + .open = graph_trace_open, + .pipe_open = graph_trace_open, + .close = graph_trace_close, + .pipe_close = graph_trace_close, + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, + .set_flag = func_graph_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function_graph, +#endif +}; + + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_tracefs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_tracefs); + +static __init int init_graph_trace(void) +{ + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could 
not register graph trace events\n"); + return 1; + } + + return register_tracer(&graph_trace); +} + +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 000000000..8523ea345 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,764 @@ +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +static DEFINE_PER_CPU(int, tracing_cpu); + +static DEFINE_RAW_SPINLOCK(max_trace_lock); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +static int save_flags; +static bool function_enabled; + +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + +#ifdef CONFIG_PREEMPT_TRACER +static inline int +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FUNCTION_TRACER +/* + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. + */ +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) +{ + long disabled; + int cpu; + + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. 
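+ *
+ * Typical use, as in irqsoff_tracer_call() below:
+ *
+ *	if (!func_prolog_dec(tr, &data, &flags))
+ *		return;
+ *	trace_function(tr, ip, parent_ip, flags, preempt_count());
+ *	atomic_dec(&data->disabled);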
+ */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return 0; + + local_save_flags(*flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(*flags)) + return 0; + + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + disabled = atomic_inc_return(&(*data)->disabled); + + if (likely(disabled == 1)) + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); + + atomic_dec(&data->disabled); +} +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + int cpu; + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tr->max_latency = 0; + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + int ret; + int pc; + + if (!func_prolog_dec(tr, &data, &flags)) + return 0; + + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); +} + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); + +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} + +#else +#define __trace_function trace_function + +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t irqsoff_print_line(struct 
trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(struct trace_array *tr, cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tr->max_latency) + return 0; + } + return 1; +} + +static void +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + cycle_t T0, T1, delta; + unsigned long flags; + int pc; + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + pc = preempt_count(); + + if (!report_latency(tr, delta)) + goto out; + + raw_spin_lock_irqsave(&max_trace_lock, flags); + + /* check if we are still the max latency */ + if (!report_latency(tr, delta)) + goto out_unlock; + + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, flags, 5, pc); + + if (data->critical_sequence != max_sequence) + goto out_unlock; + + data->critical_end = parent_ip; + + if (likely(!is_tracing_stopped())) { + tr->max_latency = delta; + update_max_tr_single(tr, current, cpu); + } + + max_sequence++; + +out_unlock: + raw_spin_unlock_irqrestore(&max_trace_lock, flags); + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); +} + +static inline void +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!tracer_enabled || !tracing_is_enabled()) + return; + + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) + return; + + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + + if (unlikely(!data) || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + + local_save_flags(flags); + + __trace_function(tr, ip, parent_ip, flags, preempt_count()); + + per_cpu(tracing_cpu, cpu) = 1; + + atomic_dec(&data->disabled); +} + +static inline void +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; + else + return; + + if (!tracer_enabled || !tracing_is_enabled()) + return; + + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + + if (unlikely(!data) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + local_save_flags(flags); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +/* start and stop critical timings used to for stoppage (in idle) */ +void start_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL_GPL(start_critical_timings); + +void stop_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL_GPL(stop_critical_timings); + +#ifdef CONFIG_IRQSOFF_TRACER +#ifdef CONFIG_PROVE_LOCKING +void time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); +} + +void time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void trace_hardirqs_on(void) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + if (preempt_trace() && !irq_trace()) + stop_critical_timing(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + if (preempt_trace() && !irq_trace()) + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ + +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&irqsoff_graph_return, + &irqsoff_graph_entry); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static void irqsoff_function_set(struct trace_array *tr, int set) +{ + if (set) + register_irqsoff_function(tr, is_graph(), 1); + else + unregister_irqsoff_function(tr, is_graph()); +} + +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (mask & TRACE_ITER_FUNCTION) + irqsoff_function_set(tr, set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(tr, graph, 
0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_irqsoff_function(tr, graph); +} + +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) +{ + if (irqsoff_busy) + return -EBUSY; + + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + + tr->max_latency = 0; + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + tracing_reset_online_cpus(&tr->trace_buffer); + + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) + printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + + stop_irqsoff_tracer(tr, is_graph()); + + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + + irqsoff_busy = false; +} + +static void irqsoff_tracer_start(struct trace_array *tr) +{ + tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +#ifdef CONFIG_IRQSOFF_TRACER +static int irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + return __irqsoff_tracer_init(tr); +} +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static int preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + return __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static int preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | 
TRACER_PREEMPT_OFF; + + return __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif + +__init static int init_irqsoff_tracer(void) +{ + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); + + return 0; +} +core_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 000000000..3ccf5c2c1 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,140 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + + while (trace_find_next_entry_inc(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (!skip_lines) { + print_trace_line(&iter); + trace_printk_seq(&iter.seq); + } else { + skip_lines--; + } + + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + for_each_tracing_cpu(cpu) { + 
if (iter.buffer_iter[cpu]) { + ring_buffer_read_finish(iter.buffer_iter[cpu]); + iter.buffer_iter[cpu] = NULL; + } + } +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = RING_BUFFER_ALL_CPUS; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 000000000..d0ce590f0 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1495 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> + +#include "trace_probe.h" + +#define KPROBE_EVENT_SYSTEM "kprobes" + +/** + * Kprobe event core functions + */ +struct trace_kprobe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + const char *symbol; /* symbol name */ + struct trace_probe tp; +}; + +#define SIZEOF_TRACE_KPROBE(n) \ + (offsetof(struct trace_kprobe, tp.args) + \ + (sizeof(struct probe_arg) * (n))) + + +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) +{ + return tk->rp.handler != NULL; +} + +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) +{ + return tk->symbol ? 
tk->symbol : "unknown"; +} + +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +{ + return tk->rp.kp.offset; +} + +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) +{ + return !!(kprobe_gone(&tk->rp.kp)); +} + +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) +{ + int len = strlen(mod->name); + const char *name = trace_kprobe_symbol(tk); + return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} + +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +{ + return !!strchr(trace_kprobe_symbol(tk), ':'); +} + +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; +} + +void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; +} + +/* + * Kprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); + +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); + +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. 
+ */ + set_fs(KERNEL_DS); + pagefault_disable(); + + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); + } +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); + +/* Return the length of string -- including null terminal byte */ +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); + +#define DEFINE_FETCH_symbol(type) \ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); + +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8 NULL +#define fetch_file_offset_u16 NULL +#define fetch_file_offset_u32 NULL +#define fetch_file_offset_u64 NULL +#define fetch_file_offset_string NULL +#define fetch_file_offset_string_size NULL + +/* Fetch type information table */ +static const struct fetch_type kprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + +/* + * Allocate new trace_probe and initialize it (including kprobes). 
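+ *
+ * For example, a definition written to the kprobe_events file (path
+ * assumed to be the usual tracefs/debugfs location), such as
+ *
+ *   echo 'p:myopen do_sys_open dfd=%ax filename=+0(%si):string' \
+ *	> /sys/kernel/debug/tracing/kprobe_events
+ *
+ * reaches this allocator with group "kprobes", event "myopen",
+ * symbol "do_sys_open" and two fetch arguments; the register names
+ * above are only illustrative and arch-dependent.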
+ */ +static struct trace_kprobe *alloc_trace_kprobe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, bool is_return) +{ + struct trace_kprobe *tk; + int ret = -ENOMEM; + + tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + if (!tk) + return ERR_PTR(ret); + + if (symbol) { + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) + goto error; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; + } else + tk->rp.kp.addr = addr; + + if (is_return) + tk->rp.handler = kretprobe_dispatcher; + else + tk->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event || !is_good_name(event)) { + ret = -EINVAL; + goto error; + } + + tk->tp.call.class = &tk->tp.class; + tk->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tk->tp.call.name) + goto error; + + if (!group || !is_good_name(group)) { + ret = -EINVAL; + goto error; + } + + tk->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tk->tp.class.system) + goto error; + + INIT_LIST_HEAD(&tk->list); + INIT_LIST_HEAD(&tk->tp.files); + return tk; +error: + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); + return ERR_PTR(ret); +} + +static void free_trace_kprobe(struct trace_kprobe *tk) +{ + int i; + + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_free_probe_arg(&tk->tp.args[i]); + + kfree(tk->tp.call.class->system); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); +} + +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) +{ + struct trace_kprobe *tk; + + list_for_each_entry(tk, &probe_list, list) + if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && + strcmp(tk->tp.call.class->system, group) == 0) + return tk; + return NULL; +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +{ + int ret = 0; + + if (file) { + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + ret = -ENOMEM; + goto out; + } + + link->file = file; + list_add_tail_rcu(&link->list, &tk->tp.files); + + tk->tp.flags |= TP_FLAG_TRACE; + } else + tk->tp.flags |= TP_FLAG_PROFILE; + + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); + else + ret = enable_kprobe(&tk->rp.kp); + } + out: + return ret; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +{ + struct event_file_link *link = NULL; + int wait = 0; + int ret = 0; + + if (file) { + link = find_event_file_link(&tk->tp, file); + if (!link) { + ret = -EINVAL; + goto out; + } + + list_del_rcu(&link->list); + wait = 1; + if (!list_empty(&tk->tp.files)) + goto out; + + tk->tp.flags &= ~TP_FLAG_TRACE; + } else + tk->tp.flags &= ~TP_FLAG_PROFILE; + + if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); + else + disable_kprobe(&tk->rp.kp); + wait = 1; + } + out: + if (wait) { + /* + * Synchronize with kprobe_trace_func/kretprobe_trace_func + * to ensure disabled (all running handlers are finished). 
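+ * synchronize_sched() fits here because kprobe handlers run with
+ * preemption disabled, so once every CPU has passed through a
+ * preemptible section no handler can still be walking the old
+ * tp.files list.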
+ * This is not only for kfree(), but also the caller, + * trace_remove_event_call() supposes it for releasing + * event_call related objects, which will be accessed in + * the kprobe_trace_func/kretprobe_trace_func. + */ + synchronize_sched(); + kfree(link); /* Ignored if link == NULL */ + } + + return ret; +} + +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_kprobe(struct trace_kprobe *tk) +{ + int i, ret; + + if (trace_probe_is_registered(&tk->tp)) + return -EINVAL; + + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_update_arg(&tk->tp.args[i]); + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + else + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; + + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); + else + ret = register_kprobe(&tk->rp.kp); + + if (ret == 0) + tk->tp.flags |= TP_FLAG_REGISTERED; + else { + pr_warning("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { + pr_warning("This probe might be able to register after" + "target module is loaded. Continue.\n"); + ret = 0; + } else if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tk->rp.kp.addr); + ret = -EINVAL; + } + } + + return ret; +} + +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_kprobe(struct trace_kprobe *tk) +{ + if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); + else + unregister_kprobe(&tk->rp.kp); + tk->tp.flags &= ~TP_FLAG_REGISTERED; + /* Cleanup kprobe for reuse */ + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; + } +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static int unregister_trace_kprobe(struct trace_kprobe *tk) +{ + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tk->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_kprobe_event(tk)) + return -EBUSY; + + __unregister_trace_kprobe(tk); + list_del(&tk->list); + + return 0; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_kprobe(struct trace_kprobe *tk) +{ + struct trace_kprobe *old_tk; + int ret; + + mutex_lock(&probe_lock); + + /* Delete old (same name) event if exist */ + old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + tk->tp.call.class->system); + if (old_tk) { + ret = unregister_trace_kprobe(old_tk); + if (ret < 0) + goto end; + free_trace_kprobe(old_tk); + } + + /* Register new event */ + ret = register_kprobe_event(tk); + if (ret) { + pr_warning("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret < 0) + unregister_kprobe_event(tk); + else + list_add_tail(&tk->list, &probe_list); + +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Module notifier call back, checking event on the module */ +static int trace_kprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct trace_kprobe *tk; + int ret; + + if (val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + /* Update probes on coming module */ + mutex_lock(&probe_lock); + list_for_each_entry(tk, &probe_list, list) { + if (trace_kprobe_within_module(tk, mod)) { + /* Don't 
need to check busy - this should have gone. */ + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); + if (ret) + pr_warning("Failed to re-register probe %s on" + "%s: %d\n", + ftrace_event_name(&tk->tp.call), + mod->name, ret); + } + } + mutex_unlock(&probe_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, + .priority = 1 /* Invoked after kprobe module callback */ +}; + +static int create_trace_kprobe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_kprobe *tk; + int i, ret = 0; + bool is_return = false, is_delete = false; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + /* argc must be >= 1 */ + if (argv[0][0] == 'p') + is_return = false; + else if (argv[0][0] == 'r') + is_return = true; + else if (argv[0][0] == '-') + is_delete = true; + else { + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + } + if (!group) + group = KPROBE_EVENT_SYSTEM; + + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + mutex_lock(&probe_lock); + tk = find_trace_kprobe(event, group); + if (!tk) { + mutex_unlock(&probe_lock); + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + ret = unregister_trace_kprobe(tk); + if (ret == 0) + free_trace_kprobe(tk); + mutex_unlock(&probe_lock); + return ret; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = traceprobe_split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + is_return ? 
'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return PTR_ERR(tk); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + /* Increment count for freeing args in error case */ + tk->tp.nr_args++; + + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) { + *arg++ = '\0'; + parg->name = kstrdup(argv[i], GFP_KERNEL); + } else { + arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + parg->name = kstrdup(buf, GFP_KERNEL); + } + + if (!parg->name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, parg->name); + ret = -EINVAL; + goto error; + } + + if (traceprobe_conflict_field_name(parg->name, + tk->tp.args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, + is_return, true, + kprobes_fetch_type_table); + if (ret) { + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + goto error; + } + } + + ret = register_trace_kprobe(tk); + if (ret) + goto error; + return 0; + +error: + free_trace_kprobe(tk); + return ret; +} + +static int release_all_trace_kprobes(void) +{ + struct trace_kprobe *tk; + int ret = 0; + + mutex_lock(&probe_lock); + /* Ensure no probe is in use. */ + list_for_each_entry(tk, &probe_list, list) + if (trace_probe_is_enabled(&tk->tp)) { + ret = -EBUSY; + goto end; + } + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tk = list_entry(probe_list.next, struct trace_kprobe, list); + ret = unregister_trace_kprobe(tk); + if (ret) + goto end; + free_trace_kprobe(tk); + } + +end: + mutex_unlock(&probe_lock); + + return ret; +} + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_kprobe *tk = v; + int i; + + seq_putc(m, trace_kprobe_is_return(tk) ? 
'r' : 'p'); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, + ftrace_event_name(&tk->tp.call)); + + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); + else + seq_printf(m, " %s", trace_kprobe_symbol(tk)); + + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_trace_kprobes(); + if (ret < 0) + return ret; + } + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return traceprobe_probes_write(file, buffer, count, ppos, + create_trace_kprobe); +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_kprobe *tk = v; + + seq_printf(m, " %-44s %15lu %15lu\n", + ftrace_event_name(&tk->tp.call), tk->nhit, + tk->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static nokprobe_inline void +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) +{ + struct kprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, dsize, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tk->tp.call; + + WARN_ON(call != ftrace_file->event_call); + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + local_save_flags(irq_flags); + pc = preempt_count(); + + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; + + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); +} + +static void +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ + struct event_file_link *link; + + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kprobe_trace_func(tk, regs, link->file); +} +NOKPROBE_SYMBOL(kprobe_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) +{ + struct 
kretprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, pc, dsize; + unsigned long irq_flags; + struct ftrace_event_call *call = &tk->tp.call; + + WARN_ON(call != ftrace_file->event_call); + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + local_save_flags(irq_flags); + pc = preempt_count(); + + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; + + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->func = (unsigned long)tk->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); +} + +static void +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct event_file_link *link; + + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kretprobe_trace_func(tk, ri, regs, link->file); +} +NOKPROBE_SYMBOL(kretprobe_trace_func); + +/* Event entry printers */ +static enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + u8 *data; + int i; + + field = (struct kprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); + + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kretprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + u8 *data; + int i; + + field = (struct kretprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); + + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry_head field; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + /* Set argument names as fields */ + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} + +static int 
kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry_head field; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + /* Set argument names as fields */ + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} + +#ifdef CONFIG_PERF_EVENTS + +/* Kprobe profile handler */ +static void +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ + struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; + struct kprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + if (prog && !trace_call_bpf(prog, regs)) + return; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + if (!entry) + return; + + entry->ip = (unsigned long)tk->rp.kp.addr; + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); +} +NOKPROBE_SYMBOL(kprobe_perf_func); + +/* Kretprobe profile handler */ +static void +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + if (prog && !trace_call_bpf(prog, regs)) + return; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + if (!entry) + return; + + entry->func = (unsigned long)tk->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); +} +NOKPROBE_SYMBOL(kretprobe_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. 
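+ * TRACE_REG_REGISTER/UNREGISTER below enable or disable the probe for the
+ * ftrace event file passed in @data, while the TRACE_REG_PERF_* cases pass a
+ * NULL file so that only the perf path is affected.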
+ */ +static int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)event->data; + struct ftrace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_kprobe(tk, file); + case TRACE_REG_UNREGISTER: + return disable_trace_kprobe(tk, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_kprobe(tk, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_kprobe(tk, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + + tk->nhit++; + + if (tk->tp.flags & TP_FLAG_TRACE) + kprobe_trace_func(tk, regs); +#ifdef CONFIG_PERF_EVENTS + if (tk->tp.flags & TP_FLAG_PROFILE) + kprobe_perf_func(tk, regs); +#endif + return 0; /* We don't tweek kernel, so just return 0 */ +} +NOKPROBE_SYMBOL(kprobe_dispatcher); + +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); + + tk->nhit++; + + if (tk->tp.flags & TP_FLAG_TRACE) + kretprobe_trace_func(tk, ri, regs); +#ifdef CONFIG_PERF_EVENTS + if (tk->tp.flags & TP_FLAG_PROFILE) + kretprobe_perf_func(tk, ri, regs); +#endif + return 0; /* We don't tweek kernel, so just return 0 */ +} +NOKPROBE_SYMBOL(kretprobe_dispatcher); + +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct ftrace_event_call *call = &tk->tp.call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + if (trace_kprobe_is_return(tk)) { + call->event.funcs = &kretprobe_funcs; + call->class->define_fields = kretprobe_event_define_fields; + } else { + call->event.funcs = &kprobe_funcs; + call->class->define_fields = kprobe_event_define_fields; + } + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) + return -ENOMEM; + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", + ftrace_event_name(call)); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + return ret; +} + +static int unregister_kprobe_event(struct trace_kprobe *tk) +{ + int ret; + + /* tp->event is unregistered in trace_remove_event_call() */ + ret = trace_remove_event_call(&tk->tp.call); + if (!ret) + kfree(tk->tp.call.print_fmt); + return ret; +} + +/* Make a tracefs interface for controlling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + if (register_module_notifier(&trace_kprobe_module_nb)) + return -EINVAL; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + entry = tracefs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create tracefs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = 
tracefs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create tracefs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. + */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static struct ftrace_event_file * +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) + if (file->event_call == &tk->tp.call) + return file; + + return NULL; +} + +/* + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this + * stage, we can do this lockless. + */ +static __init int kprobe_trace_self_tests_init(void) +{ + int ret, warn = 0; + int (*target)(int, int, int, int, int, int); + struct trace_kprobe *tk; + struct ftrace_event_file *file; + + if (tracing_is_disabled()) + return -ENODEV; + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)", + create_trace_kprobe); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting new probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe(tk, file); + } + } + + ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " + "$retval", create_trace_kprobe); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd new probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe(tk, file); + } + } + + if (warn) + goto end; + + ret = target(1, 2, 3, 4, 5, 6); + + /* Disable trace points before removing it */ + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting test probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe(tk, file); + } + + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd test probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe(tk, file); + } + + ret = traceprobe_command("-:testprobe", create_trace_kprobe); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on deleting a probe.\n"); + warn++; + } + + ret = traceprobe_command("-:testprobe2", create_trace_kprobe); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on deleting a probe.\n"); + warn++; + } + +end: + 
release_all_trace_kprobes(); + if (warn) + pr_cont("NG: Some tests are failed. Please check them.\n"); + else + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 000000000..7a9ba62e9 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,364 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi> + */ + +#define DEBUG 1 + +#include <linux/kernel.h> +#include <linux/mmiotrace.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/time.h> + +#include <linux/atomic.h> + +#include "trace.h" +#include "trace_output.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; +static unsigned long prev_overruns; +static atomic_t dropped_count; + +static void mmio_reset_data(struct trace_array *tr) +{ + overrun_detected = false; + prev_overruns = 0; + + tracing_reset_online_cpus(&tr->trace_buffer); +} + +static int mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + + mmio_reset_data(tr); + enable_mmiotrace(); + return 0; +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_start(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_reset_data(tr); +} + +static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + trace_seq_printf(s, " %s\n", drv->name); + else + trace_seq_puts(s, " \n"); +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_puts(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! 
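+ * The header_iter allocated in mmio_pipe_open() is otherwise freed from
+ * mmio_read() once the last PCI device header has been printed.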
*/ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + unsigned long cnt = atomic_xchg(&dropped_count, 0); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); + + if (over > prev_overruns) + cnt += over - prev_overruns; + prev_overruns = over; + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; +} + +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + trace_assign_type(field, entry); + rw = &field->rw; + + switch (rw->opcode) { + case MMIO_READ: + trace_seq_printf(s, + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + trace_seq_printf(s, + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + trace_seq_printf(s, + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + trace_seq_puts(s, "rw what?\n"); + break; + } + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + trace_assign_type(field, entry); + m = &field->map; + + switch (m->opcode) { + case MMIO_PROBE: + trace_seq_printf(s, + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + trace_seq_printf(s, + "UNMAP %u.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + trace_seq_puts(s, "map what?\n"); + break; + } + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; + struct trace_seq *s = 
&iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + /* The trailing newline must be in the message. */ + trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); + default: + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .start = mmio_trace_start, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .print_line = mmio_print_line, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct ftrace_event_call *call = &event_mmiotrace_rw; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; + int pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, + sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->rw = *rw; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); + __trace_mmiotrace_rw(tr, data, rw); +} + +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct ftrace_event_call *call = &event_mmiotrace_map; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; + int pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, + sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->map = *map; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 000000000..8bb207147 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,99 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net> + * + */ + +#include <linux/module.h> +#include <linux/ftrace.h> + +#include "trace.h" + +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by 
set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static int nop_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + start_nop_trace(tr); + return 0; +} + +static void nop_trace_reset(struct trace_array *tr) +{ + stop_nop_trace(tr); +} + +/* It only serves as a signal handler and a callback to + * accept or refuse tthe setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." + "Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; +} + + +struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif + .flags = &nop_flags, + .set_flag = nop_set_flag, + .allow_instances = true, +}; + diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 000000000..25a086bcb --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,1256 @@ +/* + * trace_output.c + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + */ + +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ftrace.h> + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +DECLARE_RWSEM(trace_event_sem); + +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +static int next_event_type = __TRACE_LAST_TYPE + 1; + +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + + trace_assign_type(field, entry); + + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + + trace_assign_type(field, entry); + + trace_seq_puts(s, field->buf); + + return trace_handle_return(s); +} + +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct 
trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_flags_seq); + +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(trace_seq_buffer_ptr(p))) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq); + +#if BITS_PER_LONG == 32 +const char * +ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, + const struct trace_print_flags_u64 *symbol_array) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(trace_seq_buffer_ptr(p))) + trace_seq_printf(p, "0x%llx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +#endif + +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? 
"" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + size_t buf_len = count * el_size; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct ftrace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + + event = container_of(trace_event, struct ftrace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + trace_seq_printf(s, "%s: ", ftrace_event_name(event)); + + return trace_handle_return(s); +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + +static int ftrace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "%s: ", name); + trace_seq_vprintf(s, fmt, ap); + + return trace_handle_return(s); +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 
+{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + +#ifdef CONFIG_KRETPROBES +static inline const char *kretprobed(const char *name) +{ + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; +} +#else +static inline const char *kretprobed(const char *name) +{ + return name; +} +#endif /* CONFIG_KRETPROBES */ + +static void +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + name = kretprobed(str); + + trace_seq_printf(s, fmt, name); +#endif +} + +static void +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + sprint_symbol(str, address); + name = kretprobed(str); + + trace_seq_printf(s, fmt, name); +#endif +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (s->full) + return 0; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + trace_seq_printf(s, " <" IP_FMT ">", ip); + return !trace_seq_has_overflowed(s); +} + +int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + + if (!ip) { + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); + continue; + } + + seq_print_user_ip(s, mm, ip, sym_flags); + trace_seq_putc(s, '\n'); + } + + if (mm) + mmput(mm); + + return !trace_seq_has_overflowed(s); +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + if (!ip) { + trace_seq_putc(s, '0'); + goto out; + } + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + seq_print_sym_offset(s, "%s", ip); + else + seq_print_sym_short(s, "%s", ip); + + if (sym_flags & TRACE_ITER_SYM_ADDR) + trace_seq_printf(s, " <" IP_FMT ">", ip); + + out: + return !trace_seq_has_overflowed(s); +} + +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count. 
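+ * For illustration: a field such as "d.h1" means irqs are disabled ('d'),
+ * no reschedule is pending ('.'), the event fired in hard irq context ('h'),
+ * and the preempt count was 1; unset fields are printed as '.'.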
+ */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; + default: + need_resched = '.'; + break; + } + + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + + trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq); + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_putc(s, '.'); + + return !trace_seq_has_overflowed(s); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu); + + return trace_print_lat_fmt(s, entry); +} + +#undef MARK +#define MARK(v, s) {.val = v, .sym = s} +/* trace overhead mark */ +static const struct trace_mark { + unsigned long long val; /* unit: nsec */ + char sym; +} mark[] = { + MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(1000000ULL , '#'), /* 1000 usecs */ + MARK(100000ULL , '!'), /* 100 usecs */ + MARK(10000ULL , '+'), /* 10 usecs */ +}; +#undef MARK + +char trace_find_mark(unsigned long long d) +{ + int i; + int size = ARRAY_SIZE(mark); + + for (i = 0; i < size; i++) { + if (d >= mark[i].val) + break; + } + + return (i == size) ? 
' ' : mark[i].sym; +} + +static int +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) +{ + unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; + unsigned long long rel_ts = next_ts - iter->ts; + struct trace_seq *s = &iter->seq; + + if (in_ns) { + abs_ts = ns2usecs(abs_ts); + rel_ts = ns2usecs(rel_ts); + } + + if (verbose && in_ns) { + unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); + unsigned long abs_msec = (unsigned long)abs_ts; + unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); + unsigned long rel_msec = (unsigned long)rel_ts; + + trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + + } else if (verbose && !in_ns) { + trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + + } else if (!verbose && in_ns) { + trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + trace_find_mark(rel_ts * NSEC_PER_USEC)); + + } else { /* !verbose && !in_ns */ + trace_seq_printf(s, " %4lld: ", abs_ts); + } + + return !trace_seq_has_overflowed(s); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + unsigned long long t; + unsigned long secs, usec_rem; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf(s, "%16s-%-5d [%03d] ", + comm, entry->pid, iter->cpu); + + if (trace_flags & TRACE_ITER_IRQ_INFO) + trace_print_lat_fmt(s, entry); + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(iter->ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu: ", iter->ts); + + return !trace_seq_has_overflowed(s); +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + u64 next_ts; + /* trace_find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent, + *next_entry = trace_find_next_entry(iter, NULL, + &next_ts); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + + /* Restore the original ent_size */ + iter->ent_size = ent_size; + + if (!next_entry) + next_ts = iter->ts; + + if (verbose) { + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); + } else { + lat_print_generic(s, entry, iter->cpu); + } + + lat_print_timestamp(iter, next_ts); + + return !trace_seq_has_overflowed(s); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. 
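+ * The lookup hashes @type into one of EVENT_HASHSIZE hlist buckets and walks
+ * the short chain there, so it stays cheap even with many registered events.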
+ */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry(event, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + +void trace_event_read_lock(void) +{ + down_read(&trace_event_sem); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_sem); +} + +/** + * register_ftrace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. + */ +int register_ftrace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + down_write(&trace_event_sem); + + if (WARN_ON(!event)) + goto out; + + if (WARN_ON(!event->funcs)) + goto out; + + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list = NULL; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { + printk(KERN_WARNING "Need to add type to trace.h\n"); + WARN_ON(1); + goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } + + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head(&event->node, &event_hash[key]); + + ret = event->type; + out: + up_write(&trace_event_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_event); + +/* + * Used by module code with the trace_event_sem held for write. 
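+ * Callers that do not already hold trace_event_sem should use
+ * unregister_ftrace_event() below, which takes the semaphore itself.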
+ */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} + +/** + * unregister_ftrace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_ftrace_event(struct trace_event *event) +{ + down_write(&trace_event_sem); + __unregister_ftrace_event(event); + up_write(&trace_event_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_event); + +/* + * Standard events + */ + +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); + + return trace_handle_return(&iter->seq); +} + +/* TRACE_FN */ +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + seq_print_ip_sym(s, field->ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); + } + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->ip); + SEQ_PUT_HEX_FIELD(s, field->parent_ip); + + return trace_handle_return(s); +} + +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->ip); + SEQ_PUT_FIELD(s, field->parent_ip); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_fn_funcs = { + .trace = trace_fn_trace, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = trace_fn_bin, +}; + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) +{ + struct ctx_switch_entry *field; + char comm[TASK_COMM_LEN]; + int S, T; + + + trace_assign_type(field, iter->ent); + + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); + trace_find_cmdline(field->next_pid, comm); + trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_print(iter, "==>"); +} + +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return trace_ctxwake_print(iter, " +"); +} + +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = 
task_state_char(field->prev_state); + T = task_state_char(field->next_state); + trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, 0); +} + +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, '+'); +} + + +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = task_state_char(field->prev_state); + T = task_state_char(field->next_state); + + SEQ_PUT_HEX_FIELD(s, field->prev_pid); + SEQ_PUT_HEX_FIELD(s, field->prev_prio); + SEQ_PUT_HEX_FIELD(s, S); + SEQ_PUT_HEX_FIELD(s, field->next_cpu); + SEQ_PUT_HEX_FIELD(s, field->next_pid); + SEQ_PUT_HEX_FIELD(s, field->next_prio); + SEQ_PUT_HEX_FIELD(s, T); + + return trace_handle_return(s); +} + +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, 0); +} + +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, '+'); +} + +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->prev_pid); + SEQ_PUT_FIELD(s, field->prev_prio); + SEQ_PUT_FIELD(s, field->prev_state); + SEQ_PUT_FIELD(s, field->next_cpu); + SEQ_PUT_FIELD(s, field->next_pid); + SEQ_PUT_FIELD(s, field->next_prio); + SEQ_PUT_FIELD(s, field->next_state); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_ctx_funcs = { + .trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { + .trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, +}; + +/* TRACE_STACK */ + +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct stack_entry *field; + struct trace_seq *s = &iter->seq; + unsigned long *p; + unsigned long *end; + + trace_assign_type(field, iter->ent); + end = (unsigned long *)((long)iter->ent + iter->ent_size); + + trace_seq_puts(s, "<stack trace>\n"); + + for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { + + if (trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + seq_print_ip_sym(s, *p, flags); + trace_seq_putc(s, '\n'); + } + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_stack_funcs = { + .trace = trace_stack_print, +}; + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .funcs = &trace_stack_funcs, +}; + +/* TRACE_USER_STACK */ +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags, 
struct trace_event *event) +{ + struct userstack_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_puts(s, "<user stack trace>\n"); + seq_print_userip_objs(field, s, flags); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_user_stack_funcs = { + .trace = trace_user_stack_print, +}; + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .funcs = &trace_user_stack_funcs, +}; + +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + +/* TRACE_BPRINT */ +static enum print_line_t +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprint_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bprint_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .funcs = &trace_bprint_funcs, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_printf(s, ": %s", field->buf); + + return trace_handle_return(s); +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_print_funcs = { + .trace = trace_print_print, + .raw = trace_print_raw, +}; + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + 
&trace_stack_event, + &trace_user_stack_event, + &trace_bputs_event, + &trace_bprint_event, + &trace_print_event, + NULL +}; + +__init static int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + + ret = register_ftrace_event(event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + event->type); + WARN_ON_ONCE(1); + } + } + + return 0; +} +early_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 000000000..8ef2c40ef --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,45 @@ +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include <linux/trace_seq.h> +#include "trace.h" + +extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); +extern int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); + +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); +extern struct trace_event *ftrace_find_event(int type); + +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags, struct trace_event *event); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); + +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_sem; + +#define SEQ_PUT_FIELD(s, x) \ + trace_seq_putmem(s, &(x), sizeof(x)) + +#define SEQ_PUT_HEX_FIELD(s, x) \ + trace_seq_putmem_hex(s, &(x), sizeof(x)) + +#endif + diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 000000000..36c1455b7 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,366 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com> + * + */ +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/slab.h> + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. 
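+ * Each entry pairs a list_head with a kmalloc()ed copy of the format string;
+ * hold_module_trace_bprintk_format() below builds the list as modules come up.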
+ */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + const char *fmt; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + char *fmt; + + /* allocate the trace_printk per cpu buffers */ + if (start != end) + trace_printk_init_buffers(); + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + fmt = NULL; + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) { + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); + } + *iter = fmt; + + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. 
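+ * container_of(v, struct trace_bprintk_fmt, fmt) recovers that entry, and
+ * following its list.next yields the next format (or NULL once the walk
+ * wraps back to trace_bprintk_fmt_list).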
+ */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_bprintk(unsigned long ip, const char *fmt, ...) + { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vbprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + int last_index; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return __start___trace_bprintk_fmt + *pos; + + /* + * The __tracepoint_str section is treated the same as the + * __trace_printk_fmt section. The difference is that the + * __trace_printk_fmt section should only be used by trace_printk() + * in a debugging environment, as if anything exists in that section + * the trace_prink() helper buffers are allocated, which would just + * waste space in a production environment. + * + * The __tracepoint_str sections on the other hand are used by + * tracepoints which need to map pointers to their strings to + * the ASCII text for userspace. + */ + last_index = start_index; + start_index = __stop___tracepoint_str - __start___tracepoint_str; + + if (*pos < last_index + start_index) + return __start___tracepoint_str + (*pos - last_index); + + return find_next_mod_format(start_index, v, fmt, pos); +} + +static void * +t_start(struct seq_file *m, loff_t *pos) +{ + format_mod_start(); + return find_next(NULL, pos); +} + +static void *t_next(struct seq_file *m, void * v, loff_t *pos) +{ + (*pos)++; + return find_next(v, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + const char **fmt = v; + const char *str = *fmt; + int i; + + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); + + /* + * Tabs and new lines need to be converted. 
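+ * Each entry is printed as a single '0x<addr> : "<fmt>"' line, so
+ * embedded newlines, tabs and quotes are escaped first.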
+ */ + for (i = 0; str[i]; i++) { + switch (str[i]) { + case '\n': + seq_puts(m, "\\n"); + break; + case '\t': + seq_puts(m, "\\t"); + break; + case '\\': + seq_putc(m, '\\'); + break; + case '"': + seq_puts(m, "\\\""); + break; + default: + seq_putc(m, str[i]); + } + } + seq_puts(m, "\"\n"); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + format_mod_stop(); +} + +static const struct seq_operations show_format_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static int +ftrace_formats_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &show_format_seq_ops); +} + +static const struct file_operations ftrace_formats_fops = { + .open = ftrace_formats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int init_trace_printk_function_export(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + trace_create_file("printk_formats", 0444, d_tracer, + NULL, &ftrace_formats_fops); + + return 0; +} + +fs_initcall(init_trace_printk_function_export); + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 000000000..1769a81da --- /dev/null +++ b/kernel/trace/trace_probe.c @@ -0,0 +1,723 @@ +/* + * Common code for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include "trace_probe.h" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + FIELD_STRING_IP, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent) \ +{ \ + trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + return !trace_seq_has_overflowed(s); \ +} \ +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") + +/* Print type function for string type */ +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + trace_seq_printf(s, " %s=(fault)", name); + else + trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); + return !trace_seq_has_overflowed(s); +} +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); + +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +#define DEFINE_FETCH_retval(type) \ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; + long offset; + fetch_func_t fetch; + fetch_func_t fetch_size; +}; + +#define DEFINE_FETCH_deref(type) \ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + dprm->fetch(regs, (void *)addr, dest); \ + } 
else \ + *(type *)dest = 0; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) + +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + struct deref_fetch_param *dprm = data; + unsigned long addr; + + call_fetch(&dprm->orig, regs, &addr); + if (addr && dprm->fetch_size) { + addr += dprm->offset; + dprm->fetch_size(regs, (void *)addr, dest); + } else + *(string_size *)dest = 0; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); + +static void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} +NOKPROBE_SYMBOL(update_deref_fetch_param); + +static void free_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} +NOKPROBE_SYMBOL(free_deref_fetch_param); + +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. 
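+ * Only a wrapped deref or symbol fetch owns extra allocations, so
+ * those are the only ones that need to be freed here.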
+ */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + + kfree(data); +} + +static const struct fetch_type *find_fetch_type(const char *type, + const struct fetch_type *ftbl) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + + type = strchr(type, '/'); + if (!type) + goto fail; + + type++; + if (kstrtoul(type, 0, &bs)) + goto fail; + + switch (bs) { + case 8: + return find_fetch_type("u8", ftbl); + case 16: + return find_fetch_type("u16", ftbl); + case 32: + return find_fetch_type("u32", ftbl); + case 64: + return find_fetch_type("u64", ftbl); + default: + goto fail; + } + } + + for (i = 0; ftbl[i].name; i++) { + if (strcmp(type, ftbl[i].name) == 0) + return &ftbl[i]; + } + +fail: + return NULL; +} + +/* Special function : only accept unsigned long */ +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_kernel_stack_address); + +static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ + *(unsigned long *)dest = user_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_user_stack_address); + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn, + const struct fetch_type *ftbl) +{ + int i; + + if (type != &ftbl[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + + return NULL; +} + +/* Split symbol and offset. 
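+ * e.g. "vfs_read+8" splits into the symbol "vfs_read" and offset 8;
+ * without a '+' the offset is simply 0.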
 */
+int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because kstrtoul doesn't accept '+' */
+ ret = kstrtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+
+ *tmp = '\0';
+ } else
+ *offset = 0;
+
+ return 0;
+}
+
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return,
+ bool is_kprobe)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return)
+ f->fn = t->fetch[FETCH_MTD_retval];
+ else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
+ return -EINVAL;
+
+ if (is_kprobe)
+ f->fn = fetch_kernel_stack_address;
+ else
+ f->fn = fetch_user_stack_address;
+ } else if (isdigit(arg[5])) {
+ ret = kstrtoul(arg + 5, 10, &param);
+ if (ret || (is_kprobe && param > PARAM_MAX_STACK))
+ ret = -EINVAL;
+ else {
+ f->fn = t->fetch[FETCH_MTD_stack];
+ f->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/* Recursive argument parser */
+static int parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
+{
+ unsigned long param;
+ long offset;
+ char *tmp;
+ int ret = 0;
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
+ break;
+
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ f->fn = t->fetch[FETCH_MTD_reg];
+ f->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+
+ case '@': /* memory, file-offset or symbol */
+ if (isdigit(arg[1])) {
+ ret = kstrtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_memory];
+ f->data = (void *)param;
+ } else if (arg[1] == '+') {
+ /* kprobes don't support file offsets */
+ if (is_kprobe)
+ return -EINVAL;
+
+ ret = kstrtol(arg + 2, 0, &offset);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_file_offset];
+ f->data = (void *)offset;
+ } else {
+ /* uprobes don't support symbols */
+ if (!is_kprobe)
+ return -EINVAL;
+
+ ret = traceprobe_split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->fetch[FETCH_MTD_symbol];
+ }
+ break;
+
+ case '+': /* deref memory */
+ arg++; /* Skip '+', because kstrtol() rejects it.
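+ * (a dereference argument such as "+4($stack)" or "-8(@sym)" is a
+ * signed offset followed by a nested argument in parentheses)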
*/ + case '-': + tmp = strchr(arg, '('); + if (!tmp) + break; + + *tmp = '\0'; + ret = kstrtol(arg, 0, &offset); + + if (ret) + break; + + arg = tmp + 1; + tmp = strrchr(arg, ')'); + + if (tmp) { + struct deref_fetch_param *dprm; + const struct fetch_type *t2; + + t2 = find_fetch_type(NULL, ftbl); + *tmp = '\0'; + dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); + + if (!dprm) + return -ENOMEM; + + dprm->offset = offset; + dprm->fetch = t->fetch[FETCH_MTD_memory]; + dprm->fetch_size = get_fetch_size_function(t, + dprm->fetch, ftbl); + ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, + is_kprobe, ftbl); + if (ret) + kfree(dprm); + else { + f->fn = t->fetch[FETCH_MTD_deref]; + f->data = (void *)dprm; + } + } + break; + } + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", t->name); + ret = -EINVAL; + } + + return ret; +} + +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? 
-EINVAL : 0; +} + +/* String length checking wrapper */ +int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) +{ + const char *t; + int ret; + + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t, ftbl); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = *size; + *size += parg->type->size; + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, + is_kprobe, ftbl); + + if (ret >= 0 && t != NULL) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); + + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn, + ftbl); + parg->fetch_size.data = parg->fetch.data; + } + + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + + return 0; +} + +void traceprobe_update_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + +void traceprobe_free_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + free_symbol_cache(arg->fetch.data); + + kfree(arg->name); + kfree(arg->comm); +} + +int traceprobe_command(const char *buf, int (*createfn)(int, char **)) +{ + char **argv; + int argc, ret; + + argc = 0; + ret = 0; + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, argv); + + argv_free(argv); + + return ret; +} + +#define WRITE_BUFSIZE 4096 + +ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(int, char **)) +{ + char *kbuf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = traceprobe_command(kbuf, createfn); + if (ret) + goto out; + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} + +static int 
__set_print_fmt(struct trace_probe *tp, char *buf, int len, + bool is_return) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!is_return) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, is_return); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, is_return); + tp->call.print_fmt = print_fmt; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 000000000..ab283e146 --- /dev/null +++ b/kernel/trace/trace_probe.h @@ -0,0 +1,394 @@ +/* + * Common header file for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.h written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/tracefs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 + + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. 
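+ * Both use the make_data_rloc() layout above: length in the upper
+ * 16 bits, offset in the lower 16 bits.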
+ */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +static nokprobe_inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); + +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_bitfield, + FETCH_MTD_file_offset, + FETCH_MTD_END, +}; + +/* Fetch type information table */ +struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ + /* Fetch functions */ + fetch_func_t fetch[FETCH_MTD_END]; +}; + +struct fetch_param { + fetch_func_t fn; + void *data; +}; + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent); \ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type) \ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ + void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) \ +DECLARE_FETCH_FUNC(method, u8); \ +DECLARE_FETCH_FUNC(method, u16); \ +DECLARE_FETCH_FUNC(method, u32); \ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. 
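+ * (an s32 is fetched exactly like a u32; the signedness recorded in
+ * fetch_type->is_signed is only consulted when the field is defined
+ * and printed.)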
+ */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ +ASSIGN_FETCH_FUNC(file_offset, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8 NULL +#define fetch_symbol_u16 NULL +#define fetch_symbol_u32 NULL +#define fetch_symbol_u64 NULL +#define fetch_symbol_string NULL +#define fetch_symbol_string_size NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ + return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ + return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + +struct probe_arg { + struct fetch_param fetch; + struct fetch_param fetch_size; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ +}; + +struct trace_probe { + unsigned int flags; /* For TP_FLAG_* */ + struct ftrace_event_class class; + struct ftrace_event_call call; + struct list_head files; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static nokprobe_inline void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) +{ + return fprm->fn(regs, fprm->data, dest); +} + +/* Check the name is good for event/group/fields */ +static inline int is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + +static inline struct event_file_link * 
+find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + +extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl); + +extern int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg); + +extern void traceprobe_update_arg(struct probe_arg *arg); +extern void traceprobe_free_probe_arg(struct probe_arg *arg); + +extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); + +extern ssize_t traceprobe_probes_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(int, char**)); + +extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 000000000..419ca37e7 --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,101 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <trace/events/sched.h> + +#include "trace.h" + +static int sched_ref; +static DEFINE_MUTEX(sched_register_mutex); + +static void +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); +} + +static void +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +{ + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(current); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + if (ret) { + 
pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); + return ret; +} + +static void tracing_sched_unregister(void) +{ + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); +} + +static void tracing_start_sched_switch(void) +{ + mutex_lock(&sched_register_mutex); + if (!(sched_ref++)) + tracing_sched_register(); + mutex_unlock(&sched_register_mutex); +} + +static void tracing_stop_sched_switch(void) +{ + mutex_lock(&sched_register_mutex); + if (!(--sched_ref)) + tracing_sched_unregister(); + mutex_unlock(&sched_register_mutex); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 000000000..d6e100372 --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,816 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <trace/events/sched.h> +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static int wakeup_current_cpu; +static unsigned wakeup_prio = -1; +static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; + +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static void wakeup_reset(struct trace_array *tr); +static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); + +static int save_flags; +static bool function_enabled; + +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + +#ifdef CONFIG_FUNCTION_TRACER + +/* + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. 
+ * This is OK, since the function graph tracer is + * dependent on the function tracer. + */ +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) +{ + long disabled; + int cpu; + + if (likely(!wakeup_task)) + return 0; + + *pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + disabled = atomic_inc_return(&(*data)->disabled); + if (unlikely(disabled != 1)) + goto out; + + return 1; + +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} + +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); + local_irq_restore(flags); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} +#endif /* CONFIG_FUNCTION_TRACER */ + +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static void wakeup_function_set(struct trace_array *tr, int set) +{ + if (set) + register_wakeup_function(tr, is_graph(), 1); + else + unregister_wakeup_function(tr, is_graph()); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (mask & TRACE_ITER_FUNCTION) + wakeup_function_set(tr, set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_wakeup_function(tr, graph, 0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_wakeup_function(tr, graph); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(tr, !set); + + wakeup_reset(wakeup_trace); + tr->max_latency = 0; + + return start_func_tracer(tr, set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void 
wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * Should this new latency be reported/recorded? 
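+ * If tracing_thresh is set, any latency above the threshold is
+ * recorded; otherwise only a new maximum latency is.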
+ */ +static int report_latency(struct trace_array *tr, cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tr->max_latency) + return 0; + } + return 1; +} + +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + +static void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void notrace +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + int pc; + + tracing_record_cmdline(prev); + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. 
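+ * The smp_rmb() below pairs with the smp_wmb() in start_wakeup_tracer().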
+ */ + smp_rmb(); + + if (next != wakeup_task) + return; + + pc = preempt_count(); + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + if (likely(disabled != 1)) + goto out; + + local_irq_save(flags); + arch_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + /* The task we are waiting for is waking up */ + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); + + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(wakeup_trace, delta)) + goto out_unlock; + + if (likely(!is_tracing_stopped())) { + wakeup_trace->max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } + +out_unlock: + __wakeup_reset(wakeup_trace); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +out: + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + wakeup_cpu = -1; + wakeup_prio = -1; + tracing_dl = 0; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + tracing_reset_online_cpus(&tr->trace_buffer); + + local_irq_save(flags); + arch_spin_lock(&wakeup_lock); + __wakeup_reset(tr); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +} + +static void +probe_wakeup(void *ignore, struct task_struct *p, int success) +{ + struct trace_array_cpu *data; + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + int pc; + + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); + + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + + pc = preempt_count(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + arch_spin_lock(&wakeup_lock); + + /* check for races. */ + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(wakeup_trace); + + wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; + wakeup_prio = p->prio; + + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + + /* + * We must be careful in using CALLER_ADDR2. 
But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + +out_locked: + arch_spin_unlock(&wakeup_lock); +out: + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); +} + +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. + */ + smp_wmb(); + + if (start_func_tracer(tr, is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); + + return; +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_wakeup, NULL); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + stop_func_tracer(tr, is_graph()); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); +} + +static bool wakeup_busy; + +static int __wakeup_tracer_init(struct trace_array *tr) +{ + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + + tr->max_latency = 0; + wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); + start_wakeup_tracer(tr); + + wakeup_busy = true; + return 0; +} + +static int wakeup_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 0; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 0; + wakeup_rt = 1; + return __wakeup_tracer_init(tr); +} + +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + wakeup_busy = false; +} + +static void 
wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; +} + +static void wakeup_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + + return 0; +} +core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 000000000..287cf721c --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,1220 @@ +/* Include in trace.c */ + +#include <linux/stringify.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/slab.h> + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_PRINT: + case TRACE_BRANCH: + case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RET: + return 1; + } + return 0; +} + +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) +{ + struct ring_buffer_event *event; + struct trace_entry *entry; + unsigned int loops = 0; + + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { + entry = ring_buffer_event_data(event); + + /* + * The ring buffer is a size of trace_buf_size, if + * we loop more than the size, there's something wrong + * with the ring buffer. + */ + if (loops++ > trace_buf_size) { + printk(KERN_CONT ".. bad ring buffer "); + goto failed; + } + if (!trace_valid_entry(entry)) { + printk(KERN_CONT ".. 
invalid entry %d ", + entry->type); + goto failed; + } + } + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + local_irq_save(flags); + arch_spin_lock(&buf->tr->max_lock); + + cnt = ring_buffer_entries(buf->buffer); + + /* + * The trace_test_buffer_cpu runs a while loop to consume all data. + * If the calling tracer is broken, and is constantly filling + * the buffer, this will run forever, and hard lock the box. + * We disable the ring buffer while we do this test to prevent + * a hard lock up. + */ + tracing_off(); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(buf, cpu); + if (ret) + break; + } + tracing_on(); + arch_spin_unlock(&buf->tr->max_lock); + local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(struct trace_array *tr, int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + 
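+ + /* + * Layout of this test: probe1 and probe2 each filter on one of the + * two stub functions, probe3 filters on both, and the dynamically + * allocated dyn_ops traces every function. After each call to + * DYN_FTRACE_TEST_NAME() or DYN_FTRACE_TEST_NAME2() the counters + * below must match exactly, otherwise the test fails. + */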
+ printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. + */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + +/* Test dynamic code modification and ftrace filters */ +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + int save_ftrace_enabled = ftrace_enabled; + unsigned long count; + char *func_name; + int ret; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* passed in by parameter 
to fool gcc from optimizing */ + func(); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; + tracing_start(); + + /* we should only have one item */ + if (!ret && count != 1) { + trace->reset(tr); + printk(KERN_CONT ".. filter failed count=%ld ..", count); + ret = -1; + goto out; + } + + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(tr, 1); + trace->reset(tr); + + out: + ftrace_enabled = save_ftrace_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); + + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(tr, 2); + + return ret; +} + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + if (trace_selftest_recursion_cnt++ > 10) + return; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' 
name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. 
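+ * If it did fail, FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED is added to the + * flags below and registration is retried; that flag only requests regs + * when the architecture can supply them, so the retry is expected to + * succeed either way.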
+ */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + +/* + * Simple verification test of ftrace function tracer. + * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +__init int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + int save_ftrace_enabled = ftrace_enabled; + unsigned long count; + int ret; + +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + + /* make sure msleep has been recorded */ + msleep(1); + + /* start the tracing */ + ftrace_enabled = 1; + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + + ret = trace_selftest_function_recursion(); + if (ret) + goto out; + + ret = trace_selftest_function_regs(); + out: + ftrace_enabled = save_ftrace_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FUNCTION_TRACER */ + + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } + return 0; + } + + return trace_graph_entry(trace); +} + +/* + * Pretty much the same than for the function tracer from which the selftest + * has been borrowed. + */ +__init int +trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr) +{ + int ret; + unsigned long count; + +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... 
"); + return 0; + } +#endif + + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(&tr->trace_buffer); + set_graph_array(tr); + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry_watchdog); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + tracing_start_cmdline_record(); + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* Have we just recovered from a hang? */ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + tracing_selftest_disabled = true; + ret = -1; + goto out; + } + + tracing_stop(); + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* Don't test dynamic tracing, the function tracer already did */ + +out: + /* Stop it if we failed */ + if (ret) + ftrace_graph_stop(); + + return ret; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->trace_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. 
*/ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->trace_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out_no_start; + } + + /* reset the max latency */ + tr->max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->trace_buffer, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&tr->max_buffer, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tr->max_latency = 0; + tracing_start(); + trace->start(tr); + + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->trace_buffer, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&tr->max_buffer, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + +out: + tracing_start(); +out_no_start: + trace->reset(tr); + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? 
*/ + return 0; +} +#endif + +#ifdef CONFIG_SCHED_TRACER + +struct wakeup_test_data { + struct completion is_ready; + int go; +}; + +static int trace_wakeup_test_thread(void *data) +{ + /* Make this a -deadline thread */ + static const struct sched_attr attr = { +#ifdef CONFIG_SCHED_BFS + /* No deadline on BFS, use RR */ + .sched_policy = SCHED_RR, +#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL +#endif + }; + struct wakeup_test_data *x = data; + + sched_setattr(current, &attr); + + /* Make it know we have a new prio */ + complete(&x->is_ready); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + while (!x->go) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + complete(&x->is_ready); + + set_current_state(TASK_INTERRUPTIBLE); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + __set_current_state(TASK_RUNNING); + + return 0; +} +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + struct task_struct *p; + struct wakeup_test_data data; + unsigned long count; + int ret; + + memset(&data, 0, sizeof(data)); + + init_completion(&data.is_ready); + + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&data.is_ready); + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + + while (p->on_rq) { + /* + * Sleep to make sure the -deadline thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } + + init_completion(&data.is_ready); + + data.go = 1; + /* memory barrier is in the wake_up_process() */ + + wake_up_process(p); + + /* Wait for the task to wake up */ + wait_for_completion(&data.is_ready); + + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->trace_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + + + trace->reset(tr); + tracing_start(); + + tr->max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->trace_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ + diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 000000000..b4c475a0a --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,13 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 000000000..e694c9f9e --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,377 @@ +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + * + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) + +/* How much buffer is written? */ +#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) + +/* + * trace_seq should work with being initialized with 0s. + */ +static inline void __trace_seq_init(struct trace_seq *s) +{ + if (unlikely(!s->seq.size)) + trace_seq_init(s); +} + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int ret; + + __trace_seq_init(s); + + ret = seq_buf_print_seq(m, &s->seq); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. This lets the caller try again or + * do something else with the contents. 
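+ * + * A caller-side sketch of that retry pattern (illustrative only; the + * seq_file m and the locals s, count and err are assumed to exist): + * + * trace_seq_init(&s); + * trace_seq_printf(&s, "%d entries\n", count); + * err = trace_print_seq(m, &s); // non-zero: no room yet, s keeps its text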
+ */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + unsigned int save_len = s->seq.len; + va_list ap; + + if (s->full) + return; + + __trace_seq_init(s); + + va_start(ap, fmt); + seq_buf_vprintf(&s->seq, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + */ +void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_vprintf(&s->seq, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. + * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. 
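+ * + * Like the other trace_seq write helpers this is all-or-nothing: if the + * result does not fit, the length is rolled back to what it was before + * the call and @s->full is set, so further writes become no-ops until + * the trace_seq is re-initialized.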
+ */ +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_bprintf(&s->seq, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +void trace_seq_puts(struct trace_seq *s, const char *str) +{ + unsigned int len = strlen(str); + + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, str, len); +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple charater + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +void trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return; + } + + seq_buf_putc(&s->seq, c); +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + */ +void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, mem, len); +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. + */ +void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + /* Each byte is represented by two chars */ + if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + /* The added spaces can still cause an overflow */ + seq_buf_putmem_hex(&s->seq, mem, len); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. 
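+ * A typical use (sketch only; @file stands for whatever struct file the + * caller is reporting on) is printing the name of an open file: + * + * trace_seq_path(s, &file->f_path);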
+ * + * Returns 1 if we successfully wrote all the contents to + * the buffer. + * Returns 0 if the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_path(&s->seq, path, "\n"); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the sequence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), whichever comes first. + * + * On success, it returns the number of bytes + * it copied. + * + * On failure it returns -EBUSY if all of the content in the + * sequence has already been read, which includes the case where + * there is nothing in the sequence (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. + */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ + __trace_seq_init(s); + return seq_buf_to_user(&s->seq, ubuf, cnt); +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 000000000..3f3449624 --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,484 @@ +/* + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/stacktrace.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/init.h> + +#include <asm/setup.h> + +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +#ifdef CC_USING_FENTRY +# define fentry 1 +#else +# define fentry 0 +#endif + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = + { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. 
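+ * That is why .entries below points at &stack_dump_trace[1]: slot 0 is + * filled in by check_stack() with the ip of the traced function, and the + * search of the stack starts from that saved address.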
+ */ +static struct stack_trace max_stack_trace = { + .max_entries = STACK_TRACE_ENTRIES - 1, + .entries = &stack_dump_trace[1], +}; + +static unsigned long max_stack_size; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; + +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + +static inline void +check_stack(unsigned long ip, unsigned long *stack) +{ + unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = ACCESS_ONCE(tracer_frame); + int i; + + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; + + if (this_size <= max_stack_size) + return; + + /* we do not handle interrupt stacks yet */ + if (!object_is_on_stack(stack)) + return; + + local_irq_save(flags); + arch_spin_lock(&max_stack_lock); + + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + + /* a race could have already updated it */ + if (this_size <= max_stack_size) + goto out; + + max_stack_size = this_size; + + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; + + save_stack_trace(&max_stack_trace); + + /* + * Add the passed in ip from the function tracer. + * Searching for this on the stack will skip over + * most of the overhead from the stack tracer itself. + */ + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; + + /* + * Now find where in the stack these are. + */ + i = 0; + start = stack; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < max_stack_trace.nr_entries) { + int found = 0; + + stack_dump_index[i] = this_size; + p = start; + + for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (*p == stack_dump_trace[i]) { + this_size = stack_dump_index[i++] = + (top - p) * sizeof(unsigned long); + found = 1; + /* Start the search from here */ + start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. 
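+ * The first entry located after the saved ip tells us how many bytes + * the tracer itself pushed; that amount is remembered in tracer_frame + * and subtracted from all later measurements.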
+ */ + if (unlikely(!tracer_frame) && i == 1) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + max_stack_size -= tracer_frame; + } + } + } + + if (!found) + i++; + } + + if (task_stack_end_corrupted(current)) { + print_max_stack(); + BUG(); + } + + out: + arch_spin_unlock(&max_stack_lock); + local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) +{ + unsigned long stack; + int cpu; + + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + /* no atomic needed, we only modify this variable by this cpu */ + if (per_cpu(trace_active, cpu)++ != 0) + goto out; + + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * The ip is pretty useless because the function tracer + * was called before that function set up its stack frame. + * In this case, we use the parent ip. + * + * By adding the return address of either the parent ip + * or the current ip we can disregard most of the stack usage + * caused by the stack tracer itself. + * + * The function tracer always reports the address of where the + * mcount call was, but the stack will hold the return address. + */ + if (fentry) + ip = parent_ip; + else + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); + + out: + per_cpu(trace_active, cpu)--; + /* prevent recursion in schedule */ + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + int ret; + int cpu; + + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) + return ret; + + local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. 
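+ * stack_trace_call() returns early whenever trace_active is non-zero on + * the local CPU, so raising it here prevents any function traced inside + * arch_spin_lock() from re-entering check_stack() and deadlocking on + * max_stack_lock.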
+ */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); + *ptr = val; + arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; + local_irq_restore(flags); + + return count; +} + +static const struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, + .llseek = default_llseek, +}; + +static void * +__next(struct seq_file *m, loff_t *pos) +{ + long n = *pos - 1; + + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + return NULL; + + m->private = (void *)n; + return &m->private; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __next(m, pos); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + int cpu; + + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); + + if (*pos == 0) + return SEQ_START_TOKEN; + + return __next(m, pos); +} + +static void t_stop(struct seq_file *m, void *p) +{ + int cpu; + + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + + local_irq_enable(); +} + +static void trace_lookup_stack(struct seq_file *m, long i) +{ + unsigned long addr = stack_dump_trace[i]; + + seq_printf(m, "%pS\n", (void *)addr); +} + +static void print_disabled(struct seq_file *m) +{ + seq_puts(m, "#\n" + "# Stack tracer disabled\n" + "#\n" + "# To enable the stack tracer, either add 'stacktrace' to the\n" + "# kernel command line\n" + "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" + "#\n"); +} + +static int t_show(struct seq_file *m, void *v) +{ + long i; + int size; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + if (!stack_tracer_enabled && !max_stack_size) + print_disabled(m); + + return 0; + } + + i = *(long *)v; + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return 0; + + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + + trace_lookup_stack(m, i); + + return 0; +} + +static const struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &stack_trace_seq_ops); +} + +static const struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +int +stack_trace_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == !!stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = 
!!stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + +static __init int enable_stacktrace(char *str) +{ + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + +static __init int stack_trace_init(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); + + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); + + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 000000000..6cf935316 --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,359 @@ +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ + + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/tracefs.h> +#include "trace_stat.h" +#include "trace.h" + + +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; + void *stat; +}; + +/* A stat session is the stats output in one file */ +struct stat_session { + struct list_head session_list; + struct tracer_stat *ts; + struct rb_root stat_root; + struct mutex stat_mutex; + struct dentry *file; +}; + +/* All of the sessions currently in use. 
Each stat file embed one session */ +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; + +static void __reset_stat_session(struct stat_session *session) +{ + struct stat_node *snode, *n; + + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); + kfree(snode); + } + + session->stat_root = RB_ROOT; +} + +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + +static void destroy_session(struct stat_session *session) +{ + tracefs_remove(session->file); + __reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} + +typedef int (*cmp_stat_t)(void *, void *); + +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + return 0; +} + +/* + * For tracers that don't provide a stat_cmp callback. + * This one will force an insertion as right-most node + * in the rbtree. + */ +static int dummy_cmp(void *p1, void *p2) +{ + return -1; +} + +/* + * Initialize the stat rbtree at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(struct stat_session *session) +{ + struct tracer_stat *ts = session->ts; + struct rb_root *root = &session->stat_root; + void *stat; + int ret = 0; + int i; + + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; + + stat = ts->stat_start(ts); + if (!stat) + goto exit; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit; + + /* + * Iterate over the tracer stat entries and store them in an rbtree. 
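+ * The entries come from whoever called register_stat_tracer(); a minimal + * provider looks roughly like this (sketch only, the my_* names are made + * up): + * + * static struct tracer_stat my_stat = { + * .name = "my_stat", + * .stat_start = my_stat_start, + * .stat_next = my_stat_next, + * .stat_show = my_stat_show, + * }; + * + * register_stat_tracer(&my_stat);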
+ */ + for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; + } + +exit: + mutex_unlock(&session->stat_mutex); + return ret; + +exit_free_rbtree: + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node; + int n = *pos; + int i; + + /* Prevent from tracer switch or rbtree modification */ + mutex_lock(&session->stat_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } + + node = rb_first(&session->stat_root); + for (i = 0; node && i < n; i++) + node = rb_next(node); + + return node; +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; + + if (p == SEQ_START_TOKEN) + return rb_first(&session->stat_root); + + return rb_next(node); +} + +static void stat_seq_stop(struct seq_file *s, void *p) +{ + struct stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct stat_session *session = s->private; + struct stat_node *l = container_of(v, struct stat_node, node); + + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + + return session->ts->stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +/* The session stat is refilled and resorted at each stat file opening */ +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct stat_session *session = inode->i_private; + + ret = stat_seq_init(session); + if (ret) + return ret; + + ret = seq_open(file, &trace_stat_seq_ops); + if (ret) { + reset_stat_session(session); + return ret; + } + + m = file->private_data; + m->private = session; + return ret; +} + +/* + * Avoid consuming memory with our now useless rbtree. + */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + struct stat_session *session = i->i_private; + + reset_stat_session(session); + + return seq_release(i, f); +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int tracing_stat_init(void) +{ + struct dentry *d_tracing; + + d_tracing = tracing_init_dentry(); + if (IS_ERR(d_tracing)) + return 0; + + stat_dir = tracefs_create_dir("trace_stat", d_tracing); + if (!stat_dir) + pr_warning("Could not create tracefs " + "'trace_stat' entry\n"); + return 0; +} + +static int init_stat_file(struct stat_session *session) +{ + if (!stat_dir && tracing_stat_init()) + return -ENODEV; + + session->file = tracefs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *session, *node; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? 
*/ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry(node, &all_stat_sessions, session_list) { + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); + return -EINVAL; + } + } + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + mutex_init(&session->stat_mutex); + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } + + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); + + return 0; +} + +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 000000000..8f03914b9 --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,33 @@ +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include <linux/seq_file.h> + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(struct tracer_stat *trace); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 000000000..f97f6e3a6 --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,750 @@ +#include <trace/syscall.h> +#include <trace/events/syscalls.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ +#include <linux/ftrace.h> +#include <linux/perf_event.h> +#include <asm/syscall.h> + +#include "trace_output.h" +#include "trace.h" + +static DEFINE_MUTEX(syscall_trace_lock); + +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type, void *data); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type, void *data); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only 
compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata **start; + struct syscall_metadata **stop; + char str[KSYM_SYMBOL_LEN]; + + + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + + for ( ; start < stop; start++) { + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) + return *start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +static enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, syscall; + + trace = (typeof(trace))ent; + syscall = trace->nr; + entry = syscall_nr_to_meta(syscall); + + if (!entry) + goto end; + + if (entry->enter_event->event.type != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + + trace_seq_printf(s, "%s(", entry->name); + + for (i = 0; i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + /* parameter types */ + if (trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", entry->types[i]); + + /* parameter values */ + trace_seq_printf(s, "%s: %lx%s", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? 
"" : ", "); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + + trace = (typeof(trace))ent; + syscall = trace->nr; + entry = syscall_nr_to_meta(syscall); + + if (!entry) { + trace_seq_putc(s, '\n'); + goto out; + } + + if (entry->exit_event->event.type != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + + out: + return trace_handle_return(s); +} + +extern char *__bad_type_size(void); + +#define SYSCALL_FIELD(type, name) \ + sizeof(type) != sizeof(trace.name) ? \ + __bad_type_size() : \ + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) + +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int i; + int pos = 0; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int __init set_syscall_print_fmt(struct ftrace_event_call *call) +{ + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; + + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; + return 0; + } + + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void __init free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); +} + +static int __init syscall_enter_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta = call->data; + int ret; + int i; + int offset = offsetof(typeof(trace), args); + + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0, + FILTER_OTHER); + offset += sizeof(unsigned long); + } + + return ret; +} + +static int __init syscall_exit_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_exit trace; + int ret; + + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + FILTER_OTHER); + + return ret; +} + +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) +{ + struct trace_array *tr = data; + struct ftrace_event_file 
*ftrace_file; + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; + int syscall_nr; + int size; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + local_save_flags(irq_flags); + pc = preempt_count(); + + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->enter_event->event.type, size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); +} + +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) +{ + struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; + int syscall_nr; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + local_save_flags(irq_flags); + pc = preempt_count(); + + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->exit_event->event.type, sizeof(*entry), + irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); +} + +static int reg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) +{ + struct trace_array *tr = file->tr; + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); + if (!ret) { + rcu_assign_pointer(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void unreg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) +{ + struct trace_array *tr = file->tr; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return; + mutex_lock(&syscall_trace_lock); + tr->sys_refcount_enter--; + RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); + mutex_unlock(&syscall_trace_lock); +} + +static int 
reg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) +{ + struct trace_array *tr = file->tr; + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); + if (!ret) { + rcu_assign_pointer(tr->exit_syscall_files[num], file); + tr->sys_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void unreg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) +{ + struct trace_array *tr = file->tr; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return; + mutex_lock(&syscall_trace_lock); + tr->sys_refcount_exit--; + RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); + mutex_unlock(&syscall_trace_lock); +} + +static int __init init_syscall_trace(struct ftrace_event_call *call) +{ + int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } + + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class __refdata event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class __refdata event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + +unsigned long __init __weak arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; +} + +void __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + syscalls_metadata[i] = meta; + } +} + +#ifdef CONFIG_PERF_EVENTS + +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; + +static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) +{ + struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + struct hlist_head *head; + int syscall_nr; + int rctx; + int size; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) + return; + + sys_data = 
syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + + /* get the size after alignment with the u32 buffer size field */ + size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size = ALIGN(size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, + sys_data->enter_event->event.type, NULL, &rctx); + if (!rec) + return; + + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); +} + +static int perf_sysenter_enable(struct ftrace_event_call *call) +{ + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter, NULL); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void perf_sysenter_disable(struct ftrace_event_call *call) +{ + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter, NULL); + mutex_unlock(&syscall_trace_lock); +} + +static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +{ + struct syscall_metadata *sys_data; + struct syscall_trace_exit *rec; + struct hlist_head *head; + int syscall_nr; + int rctx; + int size; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) + return; + + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, + sys_data->exit_event->event.type, NULL, &rctx); + if (!rec) + return; + + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); +} + +static int perf_sysexit_enable(struct ftrace_event_call *call) +{ + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate" + "syscall exit trace point"); + } else { + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void perf_sysexit_disable(struct ftrace_event_call *call) +{ + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit, NULL); + mutex_unlock(&syscall_trace_lock); +} + +#endif /* CONFIG_PERF_EVENTS */ + +static int 
syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) +{ + struct ftrace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_enter(file, event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_enter(file, event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysenter_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysenter_disable(event); + return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) +{ + struct ftrace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_exit(file, event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_exit(file, event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysexit_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysexit_disable(event); + return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 000000000..6dd022c7b --- /dev/null +++ b/kernel/trace/trace_uprobe.c @@ -0,0 +1,1336 @@ +/* + * uprobes-based tracing events + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) IBM Corporation, 2010-2012 + * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/namei.h> +#include <linux/string.h> + +#include "trace_probe.h" + +#define UPROBE_EVENT_SYSTEM "uprobes" + +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 
2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + +/* + * uprobe event core functions + */ +struct trace_uprobe { + struct list_head list; + struct trace_uprobe_filter filter; + struct uprobe_consumer consumer; + struct inode *inode; + char *filename; + unsigned long offset; + unsigned long nhit; + struct trace_probe tp; +}; + +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, tp.args) + \ + (sizeof(struct probe_arg) * (n))) + +static int register_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); + +static DEFINE_MUTEX(uprobe_lock); +static LIST_HEAD(uprobe_list); + +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); + +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)get_user_stack_nth(regs, \ + ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ +{ \ + type retval; \ + void __user *vaddr = (void __force __user *) addr; \ + \ + if (copy_from_user(&retval, vaddr, sizeof(type))) \ + *(type *)dest = 0; \ + else \ + *(type *) dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. 
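+ * On a failed copy the string is cleared and its length in the data
+ * location is reset to 0 via make_data_rloc().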
+ */ +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + u32 rloc = *(u32 *)dest; + int maxlen = get_rloc_len(rloc); + u8 *dst = get_rloc_data(dest); + void __user *src = (void __force __user *) addr; + + if (!maxlen) + return; + + ret = strncpy_from_user(dst, src, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + } +} + +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type) \ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ + void *offset, void *dest)\ +{ \ + void *vaddr = (void *)translate_user_vaddr(offset); \ + \ + FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +static const struct fetch_type uprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + +/* + * Allocate new trace_uprobe and initialize it (including uprobes). 
+ */ +static struct trace_uprobe * +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) +{ + struct trace_uprobe *tu; + + if (!event || !is_good_name(event)) + return ERR_PTR(-EINVAL); + + if (!group || !is_good_name(group)) + return ERR_PTR(-EINVAL); + + tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + if (!tu) + return ERR_PTR(-ENOMEM); + + tu->tp.call.class = &tu->tp.class; + tu->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tu->tp.call.name) + goto error; + + tu->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tu->tp.class.system) + goto error; + + INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); + tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); + return tu; + +error: + kfree(tu->tp.call.name); + kfree(tu); + + return ERR_PTR(-ENOMEM); +} + +static void free_trace_uprobe(struct trace_uprobe *tu) +{ + int i; + + for (i = 0; i < tu->tp.nr_args; i++) + traceprobe_free_probe_arg(&tu->tp.args[i]); + + iput(tu->inode); + kfree(tu->tp.call.class->system); + kfree(tu->tp.call.name); + kfree(tu->filename); + kfree(tu); +} + +static struct trace_uprobe *find_probe_event(const char *event, const char *group) +{ + struct trace_uprobe *tu; + + list_for_each_entry(tu, &uprobe_list, list) + if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && + strcmp(tu->tp.call.class->system, group) == 0) + return tu; + + return NULL; +} + +/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +static int unregister_trace_uprobe(struct trace_uprobe *tu) +{ + int ret; + + ret = unregister_uprobe_event(tu); + if (ret) + return ret; + + list_del(&tu->list); + free_trace_uprobe(tu); + return 0; +} + +/* Register a trace_uprobe and probe_event */ +static int register_trace_uprobe(struct trace_uprobe *tu) +{ + struct trace_uprobe *old_tu; + int ret; + + mutex_lock(&uprobe_lock); + + /* register as an event */ + old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + tu->tp.call.class->system); + if (old_tu) { + /* delete old event */ + ret = unregister_trace_uprobe(old_tu); + if (ret) + goto end; + } + + ret = register_uprobe_event(tu); + if (ret) { + pr_warning("Failed to register probe event(%d)\n", ret); + goto end; + } + + list_add_tail(&tu->list, &uprobe_list); + +end: + mutex_unlock(&uprobe_lock); + + return ret; +} + +/* + * Argument syntax: + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * + * - Remove uprobe: -:[GRP/]EVENT + */ +static int create_trace_uprobe(int argc, char **argv) +{ + struct trace_uprobe *tu; + struct inode *inode; + char *arg, *event, *group, *filename; + char buf[MAX_EVENT_NAME_LEN]; + struct path path; + unsigned long offset; + bool is_delete, is_return; + int i, ret; + + inode = NULL; + ret = 0; + is_delete = false; + is_return = false; + event = NULL; + group = NULL; + + /* argc must be >= 1 */ + if (argv[0][0] == '-') + is_delete = true; + else if (argv[0][0] == 'r') + is_return = true; + else if (argv[0][0] != 'p') { + pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + arg = strchr(event, '/'); + + if (arg) { + group = event; + event = arg + 1; + event[-1] = '\0'; + + if (strlen(group) == 0) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + } + if (!group) + group = 
UPROBE_EVENT_SYSTEM; + + if (is_delete) { + int ret; + + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + mutex_lock(&uprobe_lock); + tu = find_probe_event(event, group); + + if (!tu) { + mutex_unlock(&uprobe_lock); + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + ret = unregister_trace_uprobe(tu); + mutex_unlock(&uprobe_lock); + return ret; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + pr_info("probe point must be have a filename.\n"); + return -EINVAL; + } + arg = strchr(argv[1], ':'); + if (!arg) { + ret = -EINVAL; + goto fail_address_parse; + } + + *arg++ = '\0'; + filename = argv[1]; + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_address_parse; + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } + + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + + argc -= 2; + argv += 2; + + /* setup a probe */ + if (!event) { + char *tail; + char *ptr; + + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { + ret = -ENOMEM; + goto fail_address_parse; + } + + ptr = strpbrk(tail, ".-_"); + if (ptr) + *ptr = '\0'; + + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); + event = buf; + kfree(tail); + } + + tu = alloc_trace_uprobe(group, event, argc, is_return); + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + ret = PTR_ERR(tu); + goto fail_address_parse; + } + tu->offset = offset; + tu->inode = inode; + tu->filename = kstrdup(filename, GFP_KERNEL); + + if (!tu->filename) { + pr_info("Failed to allocate filename.\n"); + ret = -ENOMEM; + goto error; + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + /* Increment count for freeing args in error case */ + tu->tp.nr_args++; + + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) { + *arg++ = '\0'; + parg->name = kstrdup(argv[i], GFP_KERNEL); + } else { + arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + parg->name = kstrdup(buf, GFP_KERNEL); + } + + if (!parg->name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", i, parg->name); + ret = -EINVAL; + goto error; + } + + if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + is_return, false, + uprobes_fetch_type_table); + if (ret) { + pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); + goto error; + } + } + + ret = register_trace_uprobe(tu); + if (ret) + goto error; + return 0; + +error: + free_trace_uprobe(tu); + return ret; + +fail_address_parse: + iput(inode); + + pr_info("Failed to parse address or file.\n"); + + return ret; +} + +static int cleanup_all_probes(void) +{ + struct trace_uprobe *tu; + int ret = 0; + + mutex_lock(&uprobe_lock); + while (!list_empty(&uprobe_list)) { + tu = list_entry(uprobe_list.next, struct trace_uprobe, list); + ret = unregister_trace_uprobe(tu); + if (ret) + break; + } + mutex_unlock(&uprobe_lock); + return ret; +} + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&uprobe_lock); + return seq_list_start(&uprobe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &uprobe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&uprobe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + char c = is_ret_probe(tu) ? 'r' : 'p'; + int i; + + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, + ftrace_event_name(&tu->tp.call)); + seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); + + seq_putc(m, '\n'); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = cleanup_all_probes(); + if (ret) + return ret; + } + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); +} + +static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + ftrace_event_name(&tu->tp.call), tu->nhit); + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations uprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = 
page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + int cpu; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + for_each_possible_cpu(cpu) + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, + cpu)->buf); + + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + +static void __uprobe_trace_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) +{ + struct uprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + void *data; + int size, esize; + struct ftrace_event_call *call = &tu->tp.call; + + WARN_ON(call != ftrace_file->event_call); + + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); +} + +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + + return 0; +} + +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); +} + +/* Event entry printers */ +static enum print_line_t +print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) +{ + struct uprobe_trace_entry_head *entry; + struct trace_seq *s = &iter->seq; + struct trace_uprobe *tu; + u8 *data; + int i; + + entry = (struct uprobe_trace_entry_head *)iter->ent; + tu = 
container_of(event, struct trace_uprobe, tp.call.event); + + if (is_ret_probe(tu)) { + trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[1], entry->vaddr[0]); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[0]); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + if (!parg->type->print(s, parg->name, data + parg->offset, entry)) + goto out; + } + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) +{ + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; + + if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); + + tu->tp.flags |= TP_FLAG_TRACE; + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + + tu->tp.flags |= TP_FLAG_PROFILE; + } + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + if (enabled) + return 0; + + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + + tu->consumer.filter = filter; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) + goto err_buffer; + + return 0; + + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else { + tu->tp.flags &= ~TP_FLAG_PROFILE; + } + return ret; +} + +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) +{ + if (!trace_probe_is_enabled(&tu->tp)) + return; + + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->tp.flags &= file ? 
~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; + + uprobe_buffer_disable(); +} + +static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i, size; + struct uprobe_trace_entry_head field; + struct trace_uprobe *tu = event_call->data; + + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } + /* Set argument names as fields */ + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, size + parg->offset, + parg->type->size, parg->type->is_signed, + FILTER_OTHER); + + if (ret) + return ret; + } + return 0; +} + +#ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.target->mm == mm) + return true; + } + + return false; +} + +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + + write_lock(&tu->filter.rwlock); + if (event->hw.target) { + list_del(&event->hw.tp_list); + done = tu->filter.nr_systemwide || + (event->hw.target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { + tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } + write_unlock(&tu->filter.rwlock); + + if (!done) + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + + return 0; +} + +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + int err; + + write_lock(&tu->filter.rwlock); + if (event->hw.target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
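+	 * In both of those cases the breakpoint is (or will be) in place
+	 * already, so 'done' is set and the uprobe_apply() below is skipped.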
+ */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); + } else { + done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; + } + write_unlock(&tu->filter.rwlock); + + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + +static void __uprobe_perf_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct ftrace_event_call *call = &tu->tp.call; + struct uprobe_trace_entry_head *entry; + struct hlist_head *head; + void *data; + int size, esize; + int rctx; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; + + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + if (!entry) + goto out; + + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } + + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + out: + preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + if (!is_ret_probe(tu)) + __uprobe_perf_func(tu, 0, regs, ucb, dsize); + return 0; +} + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + __uprobe_perf_func(tu, func, regs, ucb, dsize); +} +#endif /* CONFIG_PERF_EVENTS */ + +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) +{ + struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(tu, file, NULL); + + case TRACE_REG_UNREGISTER: + probe_event_disable(tu, file); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_event_enable(tu, NULL, uprobe_perf_filter); + + case TRACE_REG_PERF_UNREGISTER: + probe_event_disable(tu, NULL); + return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + +#endif + default: + return 0; + } + return 0; +} + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + 
struct uprobe_cpu_buffer *ucb; + int dsize, esize; + int ret = 0; + + + tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; + + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + ret |= uprobe_trace_func(tu, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (tu->tp.flags & TP_FLAG_PROFILE) + ret |= uprobe_perf_func(tu, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); + return ret; +} + +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + tu = container_of(con, struct trace_uprobe, consumer); + + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (tu->tp.flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); + return 0; +} + +static struct trace_event_functions uprobe_funcs = { + .trace = print_uprobe_event +}; + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct ftrace_event_call *call = &tu->tp.call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &uprobe_funcs; + call->class->define_fields = uprobe_event_define_fields; + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + return -ENOMEM; + + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + + call->class->reg = trace_uprobe_register; + call->data = tu; + ret = trace_add_event_call(call); + + if (ret) { + pr_info("Failed to register uprobe event: %s\n", + ftrace_event_name(call)); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + + return ret; +} + +static int unregister_uprobe_event(struct trace_uprobe *tu) +{ + int ret; + + /* tu->event is unregistered in trace_remove_event_call() */ + ret = trace_remove_event_call(&tu->tp.call); + if (ret) + return ret; + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + return 0; +} + +/* Make a trace interface for controling probe points */ +static __init int init_uprobe_trace(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + trace_create_file("uprobe_events", 0644, d_tracer, + NULL, &uprobe_events_ops); + /* Profile interface */ + trace_create_file("uprobe_profile", 0444, d_tracer, + NULL, &uprobe_profile_ops); + return 0; +} + +fs_initcall(init_uprobe_trace); |
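The "Argument syntax" comment in create_trace_uprobe() above documents the text format accepted through the uprobe_events file that init_uprobe_trace() creates. As a minimal user-space sketch of driving that interface (the binary path, offset, and fetch argument below are placeholders, and the tracefs/debugfs mount point may differ on a given system):

#include <stdio.h>

int main(void)
{
	/* Hypothetical probe definition: path, offset and fetch arg are placeholders. */
	const char *probe =
		"p:uprobes/my_event /usr/bin/example:0x4710 arg1=%ax\n";
	/* Parsed by create_trace_uprobe(): p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] */
	FILE *f = fopen("/sys/kernel/debug/tracing/uprobe_events", "w");

	if (!f)
		return 1;
	fputs(probe, f);
	fclose(f);
	return 0;
}

The resulting event then shows up under the "uprobes" group and can be inspected through the uprobe_profile file created alongside it.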