diff options
author | John Rigby <john.rigby@linaro.org> | 2011-03-09 17:24:48 -0700 |
---|---|---|
committer | John Rigby <john.rigby@linaro.org> | 2011-03-09 17:24:48 -0700 |
commit | 0ffb81261428df3e9f4d32ed00a6bd385299bfcf (patch) | |
tree | 06bbc160842f90e0243e1b29dacd13242564e380 | |
parent | 64b883c9d077d0d0f70685ab67a88dbcd1a72720 (diff) | |
parent | 01355eaa8f6aadbf113d9d75c59accc60c274144 (diff) |
Merge remote branch 'aviksil-lttng/linaro' into linux-linaro-2.6.38-alt
256 files changed, 7891 insertions, 443 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f4a04c0c7edc..d46f541baf83 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -794,6 +794,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that can be changed at run time by the set_graph_function file in the debugfs tracing directory. + force_tsc_sync=1 + Force TSC resynchronization when SMP CPUs go online. + See also idle=poll and disable frequency scaling. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/Documentation/markers.txt b/Documentation/markers.txt new file mode 100644 index 000000000000..e25df7cd03b2 --- /dev/null +++ b/Documentation/markers.txt @@ -0,0 +1,113 @@ + Using the Linux Kernel Markers + + Mathieu Desnoyers + + +This document introduces Linux Kernel Markers and their use. It provides +examples of how to insert markers in the kernel and connect probe functions to +them and provides some examples of probe functions. + + +* Purpose of markers + +A marker placed in code provides a hook to call a function (probe) that you can +provide at runtime. A marker can be "on" (a probe is connected to it) or "off" +(no probe is attached). When a marker is "off" it has no effect, except for +adding a tiny time penalty (checking a condition for a branch) and space +penalty (adding a few bytes for the function call at the end of the +instrumented function and adds a data structure in a separate section). The +immediate values are used to minimize the impact on data cache, encoding the +condition in the instruction stream. When a marker is "on", the function you +provide is called each time the marker is executed, in the execution context of +the caller. When the function provided ends its execution, it returns to the +caller (continuing from the marker site). 
+ +You can put markers at important locations in the code. Markers are +lightweight hooks that can pass an arbitrary number of parameters, +described in a printk-like format string, to the attached probe function. + +They can be used for tracing and performance accounting. + + +* Usage + +In order to use the macro trace_mark, you should include linux/marker.h. + +#include <linux/marker.h> + +And, + +trace_mark(subsystem_event, "myint %d mystring %s", someint, somestring); +Where : +- subsystem_event is an identifier unique to your event + - subsystem is the name of your subsystem. + - event is the name of the event to mark. +- "myint %d mystring %s" is the formatted string for the serializer. "myint" and + "mystring" are respectively the field names associated with the first and + second parameter. +- someint is an integer. +- somestring is a char pointer. + +Connecting a function (probe) to a marker is done by providing a probe (function +to call) for the specific marker through marker_probe_register() and can be +activated by calling marker_arm(). Marker deactivation can be done by calling +marker_disarm() as many times as marker_arm() has been called. Removing a probe +is done through marker_probe_unregister(); it will disarm the probe. + +marker_synchronize_unregister() must be called between probe unregistration and +the first occurrence of +- the end of module exit function, + to make sure there is no caller left using the probe; +- the free of any resource used by the probes, + to make sure the probes won't be accessing invalid data. +This, and the fact that preemption is disabled around the probe call, make sure +that probe removal and module unload are safe. See the "Probe example" section +below for a sample probe module. + +The marker mechanism supports inserting multiple instances of the same marker. +Markers can be put in inline functions, inlined static functions, and +unrolled loops as well as regular functions. 
+ +The naming scheme "subsystem_event" is suggested here as a convention intended +to limit collisions. Marker names are global to the kernel: they are considered +as being the same whether they are in the core kernel image or in modules. +Conflicting format strings for markers with the same name will cause the markers +to be detected to have a different format string not to be armed and will output +a printk warning which identifies the inconsistency: + +"Format mismatch for probe probe_name (format), marker (format)" + +Another way to use markers is to simply define the marker without generating any +function call to actually call into the marker. This is useful in combination +with tracepoint probes in a scheme like this : + +void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk); + +DEFINE_MARKER_TP(marker_channel, marker_eventname, tracepoint_name, + probe_tracepoint_name, "arg1 %u pid %d"); + +notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk) +{ + struct marker *marker = &GET_MARKER(marker_channel, marker_eventname); + /* write data to trace buffers ... */ +} + +* Optimization for a given architecture + +To force use of a non-optimized version of the markers, _trace_mark() should be +used. It takes the same parameters as the normal markers, but it does not use +the immediate values based on code patching. + + +* Probe / marker example + +See the example provided in samples/markers/src + +Compile them with your kernel. + +Run, as root : +modprobe marker-example (insmod order is not important) +modprobe probe-example +cat /proc/marker-example (returns an expected error) +rmmod marker-example probe-example +dmesg diff --git a/Documentation/trace/tracepoints.txt b/Documentation/trace/tracepoints.txt index c0e1ceed75a4..d380250339a0 100644 --- a/Documentation/trace/tracepoints.txt +++ b/Documentation/trace/tracepoints.txt @@ -106,7 +106,7 @@ used to export the defined tracepoints. 
See the example provided in samples/tracepoints Compile them with your kernel. They are built during 'make' (not -'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m. +'make modules') when CONFIG_SAMPLE=y and CONFIG_SAMPLE_TRACEPOINTS=m. Run, as root : modprobe tracepoint-sample (insmod order is not important) diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 6f32f9c84a2d..1ba67b145781 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -56,7 +56,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (2*PAGE_SIZE) -#define PREEMPT_ACTIVE 0x40000000 +#define PREEMPT_ACTIVE 0x10000000 /* * Thread information flags: @@ -79,6 +79,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_UAC_SIGBUS 12 #define TIF_MEMDIE 13 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 14 /* restore signal mask in do_signal */ +#define TIF_KERNEL_TRACE 15 /* Kernel tracing of syscalls */ #define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -87,6 +88,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_FREEZE (1<<TIF_FREEZE) /* Work to do on interrupt/exception return. */ @@ -95,7 +97,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); /* Work to do on any return to userspace. 
*/ #define _TIF_ALLWORK_MASK (_TIF_WORK_MASK \ - | _TIF_SYSCALL_TRACE) + | _TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE) #define ALPHA_UAC_SHIFT 10 #define ALPHA_UAC_MASK (1 << TIF_UAC_NOPRINT | 1 << TIF_UAC_NOFIX | \ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a53245a9ef5e..d8cdd7b00acd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -13,6 +13,7 @@ config ARM select HAVE_KPROBES if (!XIP_KERNEL && !THUMB2_KERNEL) select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) + select HAVE_LTT_DUMP_TABLES select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 494224a9b459..23f8c4b764e2 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -147,4 +147,8 @@ config DEBUG_S3C_UART The uncompressor code port configuration is now handled by CONFIG_S3C_LOWLEVEL_UART_PORT. +config DEBUG_TRACE_CLOCK + bool "Debug trace clock" + depends on HAVE_TRACE_CLOCK + endmenu diff --git a/arch/arm/include/asm/a.out-core.h b/arch/arm/include/asm/a.out-core.h index 93d04acaa31f..92f10cb5c70c 100644 --- a/arch/arm/include/asm/a.out-core.h +++ b/arch/arm/include/asm/a.out-core.h @@ -32,11 +32,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) dump->u_dsize = (tsk->mm->brk - tsk->mm->start_data + PAGE_SIZE - 1) >> PAGE_SHIFT; dump->u_ssize = 0; - dump->u_debugreg[0] = tsk->thread.debug.bp[0].address; - dump->u_debugreg[1] = tsk->thread.debug.bp[1].address; - dump->u_debugreg[2] = tsk->thread.debug.bp[0].insn.arm; - dump->u_debugreg[3] = tsk->thread.debug.bp[1].insn.arm; - dump->u_debugreg[4] = tsk->thread.debug.nsaved; + memset(dump->u_debugreg, 0, sizeof(dump->u_debugreg)); if (dump->start_stack < 0x04000000) dump->u_ssize = (0x04000000 - dump->start_stack) >> PAGE_SHIFT; diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 
9a87823642d0..cf7dc925c635 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -395,7 +395,7 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, break; case 2: do { - asm volatile("@ __cmpxchg1\n" + asm volatile("@ __cmpxchg2\n" " ldrexh %1, [%2]\n" " mov %0, #0\n" " teq %1, %3\n" diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 7b5cc8dae06e..1f925b8bcd5f 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -129,6 +129,7 @@ extern void vfp_flush_hwstate(struct thread_info *); /* * thread information flags: * TIF_SYSCALL_TRACE - syscall trace active + * TIF_KERNEL_TRACE - kernel trace active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -138,6 +139,7 @@ extern void vfp_flush_hwstate(struct thread_info *); #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_KERNEL_TRACE 7 #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 @@ -149,6 +151,7 @@ extern void vfp_flush_hwstate(struct thread_info *); #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) diff --git a/arch/arm/include/asm/trace-clock.h b/arch/arm/include/asm/trace-clock.h new file mode 100644 index 000000000000..8a13b7dedde5 --- /dev/null +++ b/arch/arm/include/asm/trace-clock.h @@ -0,0 +1 @@ +#include <plat/trace-clock.h> diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index c891eb76c0e3..92684d2e9054 100644 --- a/arch/arm/include/asm/unistd.h +++ 
b/arch/arm/include/asm/unistd.h @@ -397,6 +397,8 @@ #define __NR_fanotify_mark (__NR_SYSCALL_BASE+368) #define __NR_prlimit64 (__NR_SYSCALL_BASE+369) +#define __NR_syscall_max 370 + /* * The following SWIs are ARM private. */ diff --git a/arch/arm/include/asm/user.h b/arch/arm/include/asm/user.h index 05ac4b06876a..35917b3a97f9 100644 --- a/arch/arm/include/asm/user.h +++ b/arch/arm/include/asm/user.h @@ -71,7 +71,7 @@ struct user{ /* the registers. */ unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ - int u_debugreg[8]; + int u_debugreg[8]; /* No longer used */ struct user_fp u_fp; /* FP state */ struct user_fp_struct * u_fp0;/* Used by gdb to help find the values for */ /* the FP registers. */ diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 1e7b04a40a31..1edf1deadf85 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -43,6 +43,8 @@ ret_fast_syscall: * Ok, we need to do extra processing, enter the slow path. */ fast_work_pending: + tst r1, #_TIF_KERNEL_TRACE @ flag can be set asynchronously + bne __sys_trace_return str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: tst r1, #_TIF_NEED_RESCHED @@ -85,8 +87,8 @@ ENTRY(ret_from_fork) get_thread_info tsk ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing mov why, #1 - tst r1, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? - beq ret_slow_syscall + tst r1, #_TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE + beq ret_slow_syscall @ are we tracing syscalls? mov r1, sp mov r0, #1 @ trace exit [IP = 1] bl syscall_trace @@ -441,8 +443,8 @@ ENTRY(vector_swi) 1: #endif - tst r10, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? - bne __sys_trace + tst r10, #_TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE + bne __sys_trace @ are we tracing syscalls? 
cmp scno, #NR_syscalls @ check upper syscall limit adr lr, BSYM(ret_fast_syscall) @ return address diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 94bbedbed639..fe2277c5d8cd 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -30,6 +30,7 @@ #include <linux/uaccess.h> #include <linux/random.h> #include <linux/hw_breakpoint.h> +#include <trace/sched.h> #include <asm/cacheflush.h> #include <asm/leds.h> @@ -45,6 +46,8 @@ unsigned long __stack_chk_guard __read_mostly; EXPORT_SYMBOL(__stack_chk_guard); #endif +DEFINE_TRACE(sched_kthread_create); + static const char *processor_modes[] = { "USER_26", "FIQ_26" , "IRQ_26" , "SVC_26" , "UK4_26" , "UK5_26" , "UK6_26" , "UK7_26" , "UK8_26" , "UK9_26" , "UK10_26", "UK11_26", "UK12_26", "UK13_26", "UK14_26", "UK15_26", @@ -442,6 +445,7 @@ asm( ".pushsection .text\n" pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; + long pid; memset(®s, 0, sizeof(regs)); @@ -452,7 +456,10 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.ARM_pc = (unsigned long)kernel_thread_helper; regs.ARM_cpsr = regs.ARM_r7 | PSR_I_BIT; - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + pid = do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + + trace_sched_kthread_create(fn, pid); + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 2bf27f364d09..03438e9cc069 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -21,10 +21,15 @@ #include <linux/uaccess.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> +#include <trace/syscall.h> #include <asm/pgtable.h> #include <asm/system.h> #include <asm/traps.h> +#include <asm/unistd.h> #define REG_PC 15 #define REG_PSR 16 @@ -52,6 +57,30 @@ #define BREAKINST_THUMB 0xde01 #endif 
+DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + +extern unsigned long sys_call_table[]; + +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < __NR_syscall_max + 1; i++) { + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); + +void ltt_dump_idt_table(void *call_data) +{ +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); + struct pt_regs_offset { const char *name; int offset; @@ -788,6 +817,11 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) { unsigned long ip; + if (!why) + trace_syscall_entry(regs, scno); + else + trace_syscall_exit(regs->ARM_r0); + if (!test_thread_flag(TIF_SYSCALL_TRACE)) return scno; if (!(current->ptrace & PT_PTRACED)) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 21ac43f1c2d0..41eb77da882a 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -23,6 +23,7 @@ #include <linux/kexec.h> #include <linux/delay.h> #include <linux/init.h> +#include <trace/trap.h> #include <linux/sched.h> #include <asm/atomic.h> @@ -35,6 +36,9 @@ #include "signal.h" +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; void *vectors_page; @@ -296,7 +300,11 @@ void arm_notify_die(const char *str, struct pt_regs *regs, current->thread.error_code = err; current->thread.trap_no = trap; + trace_trap_entry(regs, current->thread.trap_no); + force_sig_info(info->si_signo, info, current); + + trace_trap_exit(); } else { die(str, regs, err); } diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 898fffe0e9cb..1d6f14a584fb 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -180,6 +180,7 @@ obj-$(CONFIG_MACH_OMAP_3430SDP) += 
board-3430sdp.o \ hsmmc.o \ board-flash.o obj-$(CONFIG_MACH_NOKIA_N8X0) += board-n8x0.o +obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o obj-$(CONFIG_MACH_NOKIA_RM680) += board-rm680.o \ sdram-nokia.o \ hsmmc.o diff --git a/arch/arm/mach-omap2/clkt34xx_dpll3m2.c b/arch/arm/mach-omap2/clkt34xx_dpll3m2.c index b2b1e37bb6bb..b10d9efd6db0 100644 --- a/arch/arm/mach-omap2/clkt34xx_dpll3m2.c +++ b/arch/arm/mach-omap2/clkt34xx_dpll3m2.c @@ -24,6 +24,7 @@ #include <plat/clock.h> #include <plat/sram.h> #include <plat/sdrc.h> +#include <asm/trace-clock.h> #include "clock.h" #include "clock3xxx.h" @@ -79,6 +80,8 @@ int omap3_core_dpll_m2_set_rate(struct clk *clk, unsigned long rate) unlock_dll = 1; } + cpu_hz = arm_fck_p->rate; + /* * XXX This only needs to be done when the CPU frequency changes */ diff --git a/arch/arm/mach-omap2/clock34xx.c b/arch/arm/mach-omap2/clock34xx.c index 287abc480924..8971015538ab 100644 --- a/arch/arm/mach-omap2/clock34xx.c +++ b/arch/arm/mach-omap2/clock34xx.c @@ -18,6 +18,7 @@ #undef DEBUG #include <linux/kernel.h> +#include <linux/module.h> #include <linux/clk.h> #include <linux/io.h> @@ -94,6 +95,9 @@ const struct clkops clkops_omap3430es2_dss_usbhost_wait = { .find_companion = omap2_clk_dflt_find_companion, }; +unsigned long long cpu_hz; +EXPORT_SYMBOL(cpu_hz); + /** * omap3430es2_clk_hsotgusb_find_idlest - return CM_IDLEST info for HSOTGUSB * @clk: struct clk * being enabled diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 2f864e4b085d..dcb1dd36c24c 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -29,6 +29,7 @@ #include <linux/delay.h> #include <linux/slab.h> #include <linux/console.h> +#include <trace/pm.h> #include <plat/sram.h> #include "clockdomain.h" @@ -41,6 +42,8 @@ #include <asm/tlbflush.h> +#include <asm/trace-clock.h> + #include "cm2xxx_3xxx.h" #include "cm-regbits-34xx.h" #include "prm-regbits-34xx.h" @@ -80,6 +83,11 @@ struct power_state { struct list_head node; }; 
+DEFINE_TRACE(pm_idle_entry); +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_suspend_entry); +DEFINE_TRACE(pm_suspend_exit); + static LIST_HEAD(pwrst_list); static void (*_omap_sram_idle)(u32 *addr, int save_state); @@ -519,8 +527,23 @@ static void omap3_pm_idle(void) if (omap_irq_pending() || need_resched()) goto out; + trace_pm_idle_entry(); + save_sync_trace_clock(); + omap_sram_idle(); + /* + * Resyncing the trace clock should ideally be done much sooner. When + * we arrive here, there are already some interrupt handlers which have + * run before us, using potentially wrong timestamps. This leads + * to problems when restarting the clock (and synchronizing on the 32k + * clock) if the cycle counter was still active. + * resync_trace_clock must ensure that timestamps never ever go + * backward. + */ + resync_trace_clock(); + trace_pm_idle_exit(); + out: local_fiq_enable(); local_irq_enable(); @@ -550,7 +573,11 @@ static int omap3_pm_suspend(void) omap_uart_prepare_suspend(); omap3_intc_suspend(); - omap_sram_idle(); + trace_pm_suspend_entry(); + save_sync_trace_clock(); + omap_sram_idle(); + resync_trace_clock(); + trace_pm_suspend_exit(); restore: /* Restore next_pwrsts */ diff --git a/arch/arm/mach-omap2/trace-clock.c b/arch/arm/mach-omap2/trace-clock.c new file mode 100644 index 000000000000..3db1cdb8d59d --- /dev/null +++ b/arch/arm/mach-omap2/trace-clock.c @@ -0,0 +1,726 @@ +/* + * arch/arm/mach-omap2/trace-clock.c + * + * Trace clock for ARM OMAP3 + * + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> 2009 + */ + +#include <linux/module.h> +#include <linux/clocksource.h> +#include <linux/timer.h> +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/cpufreq.h> +#include <linux/err.h> + +#include <plat/clock.h> +#include <asm/trace-clock.h> +#include <asm/pmu.h> + +/* depends on CONFIG_OMAP_32K_TIMER */ +/* Need direct access to the clock from arch/arm/mach-omap2/timer-gp.c */ +static struct clocksource *clock; + 
+DEFINE_PER_CPU(struct pm_save_count, pm_save_count); +EXPORT_PER_CPU_SYMBOL_GPL(pm_save_count); + +static void clear_ccnt_ms(unsigned long data); + +/* According to timer32k.c, this is a 32768Hz clock, not a 32000Hz clock. */ +#define TIMER_32K_FREQ 32768 +#define TIMER_32K_SHIFT 15 + +/* + * Clear ccnt twice per 31-bit overflow, or 4 times per 32-bits period. + */ +static u32 clear_ccnt_interval; + +static DEFINE_SPINLOCK(trace_clock_lock); +static int trace_clock_refcount; + +static int print_info_done; + +static struct platform_device *reserved_pmu; + +static u32 get_mul_fact(u64 max_freq, u64 cur_freq) +{ + u64 rem; + + BUG_ON(cur_freq == 0); + return __iter_div_u64_rem(max_freq << 10, cur_freq, &rem); +} + +/* + * Cycle counter management. + */ + +static inline void write_pmnc(u32 val) +{ + __asm__ __volatile__ ("mcr p15, 0, %0, c9, c12, 0" : : "r" (val)); +} + +static inline u32 read_pmnc(void) +{ + u32 val; + __asm__ __volatile__ ("mrc p15, 0, %0, c9, c12, 0" : "=r" (val)); + return val; +} + +static inline void write_ctens(u32 val) +{ + __asm__ __volatile__ ("mcr p15, 0, %0, c9, c12, 1" : : "r" (val)); +} + +static inline u32 read_ctens(void) +{ + u32 val; + __asm__ __volatile__ ("mrc p15, 0, %0, c9, c12, 1" : "=r" (val)); + return val; +} + +static inline void write_intenc(u32 val) +{ + __asm__ __volatile__ ("mcr p15, 0, %0, c9, c14, 2" : : "r" (val)); +} + +static inline u32 read_intenc(void) +{ + u32 val; + __asm__ __volatile__ ("mrc p15, 0, %0, c9, c14, 2" : "=r" (val)); + return val; +} + +static inline void write_useren(u32 val) +{ + __asm__ __volatile__ ("mcr p15, 0, %0, c9, c14, 0" : : "r" (val)); +} + +static inline u32 read_useren(void) +{ + u32 val; + __asm__ __volatile__ ("mrc p15, 0, %0, c9, c14, 0" : "=r" (val)); + return val; +} + +/* + * Must disable counter before writing to it. 
+ */ +static inline void write_ccnt(u32 val) +{ + __asm__ __volatile__ ("mcr p15, 0, %0, c9, c13, 0" : : "r" (val)); +} + +/* + * Periodical timer handler, clears ccnt most significant bit each half-period + * of 31-bit overflow. Makes sure the ccnt never overflows. + */ +static void clear_ccnt_ms(unsigned long data) +{ + struct pm_save_count *pm_count; + unsigned int cycles; + unsigned long flags; + int cpu; + + cpu = smp_processor_id(); + pm_count = &per_cpu(pm_save_count, cpu); + + local_irq_save(flags); + + if (!pm_count->fast_clock_ready) + goto end; + + isb(); /* clear the pipeline so we can execute ASAP */ + write_ctens(read_ctens() & ~(1 << 31)); /* disable counter */ + cycles = read_ccnt(); + write_ccnt(cycles & ~(1 << 31)); + isb(); + write_ctens(read_ctens() | (1 << 31)); /* enable counter */ + isb(); +end: + local_irq_restore(flags); + + mod_timer_pinned(&pm_count->clear_ccnt_ms_timer, + jiffies + clear_ccnt_interval); +} + +/* + * disabling interrupts to protect against concurrent IPI save/resync. + */ +void save_sync_trace_clock(void) +{ + struct pm_save_count *pm_count; + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + pm_count = &per_cpu(pm_save_count, cpu); + raw_spin_lock(&pm_count->lock); + + if (!pm_count->refcount) + goto end; + + pm_count->ext_32k = clock->read(clock); + pm_count->int_fast_clock = trace_clock_read64(); +end: + raw_spin_unlock(&pm_count->lock); + + /* + * Only enable slow read after saving the clock values. + */ + barrier(); + pm_count->fast_clock_ready = 0; + + /* + * Disable counter to ensure there is no overflow while we are + * keeping track of time with ext. clock. + */ + write_ctens(read_ctens() & ~(1 << 31)); /* disable counter */ + local_irq_restore(flags); +} + +/* + * Called with preemption disabled. Read the external clock source directly + * and return corresponding time in fast clock source time frame. + * Called after time is saved and before it is resynced. 
+ * Also used to periodically resync the drifting dvfs clock on external clock. + */ +u64 _trace_clock_read_slow(void) +{ + struct pm_save_count *pm_count; + u64 ref_time; + unsigned int count_32k; + int cpu; + + cpu = smp_processor_id(); + pm_count = &per_cpu(pm_save_count, cpu); + WARN_ON_ONCE(!pm_count->refcount); + + /* + * Set the timer's value MSBs to the same as current 32K timer. + */ + ref_time = pm_count->int_fast_clock; + if (!pm_count->init_clock) + count_32k = clock->read(clock); + else + count_32k = pm_count->init_clock; + + /* + * Delta done on 32-bits, then casted to u64. Must guarantee + * that we are called often enough so the difference does not + * overflow 32 bits anyway. + */ + ref_time += (u64)(count_32k - pm_count->ext_32k) + * (cpu_hz >> TIMER_32K_SHIFT); + return ref_time; +} +EXPORT_SYMBOL_GPL(_trace_clock_read_slow); + +/* + * resynchronize the per-cpu fast clock with the last save_sync values and the + * external clock. Called from PM (thread) context and IPI context. + */ +void resync_trace_clock(void) +{ + struct pm_save_count *pm_count; + struct tc_cur_freq *new_cf, *cf; + unsigned int new_index, index; + u64 ref_time; + unsigned long flags; + u32 regval; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + pm_count = &per_cpu(pm_save_count, cpu); + raw_spin_lock(&pm_count->lock); + + if (!pm_count->refcount) + goto end; + + /* Let userspace access performance counter registers */ + regval = read_useren(); + regval |= (1 << 0); /* User mode enable */ + write_useren(regval); + + regval = read_intenc(); + regval |= (1 << 31); /* CCNT overflow interrupt disable */ + write_intenc(regval); + + regval = read_pmnc(); + regval |= (1 << 0); /* Enable all counters */ + regval &= ~(1 << 3); /* count every cycles */ + regval &= ~(1 << 5); /* Enable even in non-invasive debug prohib. 
*/ + write_pmnc(regval); + + ref_time = _trace_clock_read_slow(); + + if (pm_count->init_clock) + pm_count->init_clock = 0; + + write_ctens(read_ctens() & ~(1 << 31)); /* disable counter */ + write_ccnt((u32)ref_time & ~(1 << 31)); + write_ctens(read_ctens() | (1 << 31)); /* enable counter */ + + _trace_clock_write_synthetic_tsc(ref_time); + + index = pm_count->index; + new_index = 1 - index; + cf = &pm_count->cf[index]; + new_cf = &pm_count->cf[new_index]; + new_cf->hw_base = ref_time; + new_cf->virt_base = ref_time; + new_cf->cur_cpu_freq = cpufreq_quick_get(cpu); + if (new_cf->cur_cpu_freq == 0) + new_cf->cur_cpu_freq = pm_count->max_cpu_freq; + new_cf->mul_fact = get_mul_fact(pm_count->max_cpu_freq, + new_cf->cur_cpu_freq); + new_cf->floor = max(ref_time, cf->floor); + barrier(); + pm_count->index = new_index; + barrier(); /* make clock ready before enabling */ + pm_count->fast_clock_ready = 1; + + /* Delete resync timer if present. Just done its job anyway. */ + if (pm_count->dvfs_count) + del_timer(&pm_count->clock_resync_timer); + pm_count->dvfs_count = 0; + + if (unlikely(!print_info_done)) { + printk(KERN_INFO "Trace clock using cycle counter at %llu HZ\n" + "saved 32k clk value 0x%08X, " + "saved cycle counter value 0x%016llX\n" + "synthetic value (write, read) 0x%016llX, 0x%016llX\n", + cpu_hz, + pm_count->ext_32k, + pm_count->int_fast_clock, + ref_time, trace_clock_read64()); + printk(KERN_INFO "Reference clock used : %s\n", clock->name); + print_info_done = 1; + } +end: + raw_spin_unlock(&pm_count->lock); + local_irq_restore(flags); +} + +/* + * Called with IRQ and FIQ off. 
+ */ +static void resync_on_32k(struct pm_save_count *pm_count, int cpu, + unsigned int cached_freq, int new_freq) +{ + struct tc_cur_freq *new_cf, *cf; + u64 ref_time; + unsigned int new_index, index; + + index = pm_count->index; + + new_index = 1 - index; + cf = &pm_count->cf[index]; + new_cf = &pm_count->cf[new_index]; + ref_time = _trace_clock_read_slow(); + new_cf->hw_base = trace_clock_read_synthetic_tsc(); + new_cf->virt_base = ref_time; + if (cached_freq) + new_cf->cur_cpu_freq = cf->cur_cpu_freq; + else { + new_cf->cur_cpu_freq = new_freq; + if (new_cf->cur_cpu_freq == 0) + new_cf->cur_cpu_freq = pm_count->max_cpu_freq; + } + new_cf->mul_fact = get_mul_fact(pm_count->max_cpu_freq, + new_cf->cur_cpu_freq); + new_cf->floor = max((((new_cf->hw_base - cf->hw_base) + * cf->mul_fact) >> 10) + cf->virt_base, + cf->floor); + barrier(); + pm_count->index = new_index; +} + +/* + * Timer to resynchronize with ext. 32k clock after DVFS update (but not too + * often if flooded by DVFS updates). + * Necessary to deal with drift caused by DVFS updates. + * Per-cpu timer added by cpu freq events, single-shot. 
+ */ +static void clock_resync_timer_fct(unsigned long data) +{ + struct pm_save_count *pm_count; + unsigned long flags; + int cpu; + + cpu = smp_processor_id(); + pm_count = &per_cpu(pm_save_count, cpu); + + local_irq_save(flags); + local_fiq_disable(); /* disable fiqs for floor value */ + + /* Need to resync if we had more than 1 dvfs event in period */ + if (pm_count->dvfs_count > 1) + resync_on_32k(pm_count, cpu, 1, 0); + pm_count->dvfs_count = 0; + + local_fiq_enable(); + local_irq_restore(flags); +} + +static void prepare_timer(int cpu) +{ + struct pm_save_count *pm_count; + + pm_count = &per_cpu(pm_save_count, cpu); + init_timer_deferrable(&pm_count->clear_ccnt_ms_timer); + pm_count->clear_ccnt_ms_timer.function = clear_ccnt_ms; + pm_count->clear_ccnt_ms_timer.expires = jiffies + clear_ccnt_interval; + + init_timer_deferrable(&pm_count->clock_resync_timer); + pm_count->clock_resync_timer.function = clock_resync_timer_fct; +} + +static void enable_timer(int cpu) +{ + struct pm_save_count *pm_count; + + pm_count = &per_cpu(pm_save_count, cpu); + add_timer_on(&pm_count->clear_ccnt_ms_timer, cpu); +} + +static void disable_timer_ipi(void *info) +{ + save_sync_trace_clock(); +} + +static void disable_timer(int cpu) +{ + struct pm_save_count *pm_count; + + pm_count = &per_cpu(pm_save_count, cpu); + del_timer_sync(&pm_count->clear_ccnt_ms_timer); + if (pm_count->dvfs_count) + del_timer_sync(&pm_count->clock_resync_timer); + smp_call_function_single(cpu, disable_timer_ipi, NULL, 1); +} + +static void resync_ipi(void *info) +{ + resync_trace_clock(); +} + +void _start_trace_clock(void) +{ + struct pm_save_count *pm_count; + u32 ext_32k; + u64 old_fast_clock; + int cpu; + + ext_32k = clock->read(clock); + old_fast_clock = per_cpu(pm_save_count, 0).int_fast_clock; + + for_each_online_cpu(cpu) { + pm_count = &per_cpu(pm_save_count, cpu); + pm_count->ext_32k = ext_32k; + pm_count->int_fast_clock = old_fast_clock; + pm_count->refcount = 1; + pm_count->init_clock = 
ext_32k; + pm_count->dvfs_count = 0; + } + + on_each_cpu(resync_ipi, NULL, 1); + + get_synthetic_tsc(); + + for_each_online_cpu(cpu) { + prepare_timer(cpu); + enable_timer(cpu); + } +} + +void _stop_trace_clock(void) +{ + struct pm_save_count *pm_count; + int cpu; + + per_cpu(pm_save_count, 0).int_fast_clock = trace_clock_read64(); + + for_each_online_cpu(cpu) { + pm_count = &per_cpu(pm_save_count, cpu); + disable_timer(cpu); + pm_count->refcount = 0; + } + put_synthetic_tsc(); +} + +void start_trace_clock(void) +{ + spin_lock(&trace_clock_lock); + if (!trace_clock_refcount) + goto end; + _start_trace_clock(); +end: + spin_unlock(&trace_clock_lock); +} + +void stop_trace_clock(void) +{ + spin_lock(&trace_clock_lock); + if (!trace_clock_refcount) + goto end; + _stop_trace_clock(); +end: + spin_unlock(&trace_clock_lock); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Start/stop timers for trace clock upon cpu hotplug. + * Also resync the clock. + * + * Returns the success/failure of the operation. 
(NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + struct pm_save_count *pm_count; + unsigned int hotcpu = (unsigned long)hcpu; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + spin_lock(&trace_clock_lock); + if (trace_clock_refcount) { + pm_count = &per_cpu(pm_save_count, hotcpu); + local_irq_save(flags); + pm_count->ext_32k = clock->read(clock); + pm_count->int_fast_clock = trace_clock_read64(); + local_irq_restore(flags); + pm_count->refcount = 1; + pm_count->dvfs_count = 0; + prepare_timer(hotcpu); + } + spin_unlock(&trace_clock_lock); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&trace_clock_lock); + if (trace_clock_refcount) { + resync_trace_clock(); + enable_timer(hotcpu); + } + spin_unlock(&trace_clock_lock); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + spin_lock(&trace_clock_lock); + if (trace_clock_refcount) + disable_timer(hotcpu); + spin_unlock(&trace_clock_lock); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + spin_lock(&trace_clock_lock); + if (trace_clock_refcount) { + pm_count = &per_cpu(pm_save_count, hotcpu); + pm_count->refcount = 0; + } + spin_unlock(&trace_clock_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +int get_trace_clock(void) +{ + int ret = 0; + + spin_lock(&trace_clock_lock); + if (trace_clock_refcount) + goto end; + reserved_pmu = reserve_pmu(ARM_PMU_DEVICE_CPU); + if (IS_ERR_OR_NULL(reserved_pmu) && PTR_ERR(reserved_pmu) != -ENODEV) { + ret = -EBUSY; + goto end; + } + trace_clock_refcount++; + _start_trace_clock(); +end: + spin_unlock(&trace_clock_lock); + return ret; +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + spin_lock(&trace_clock_lock); + WARN_ON(trace_clock_refcount <= 0); + if (trace_clock_refcount != 1) + goto end; + _stop_trace_clock(); + 
release_pmu(reserved_pmu); +end: + trace_clock_refcount--; + spin_unlock(&trace_clock_lock); +} +EXPORT_SYMBOL_GPL(put_trace_clock); + +/* + * We do not use prechange hook to sample 2 clock values and average because + * locking wrt other timers can be difficult to get right. + * A bit more imprecision just increases the drift. We have a periodic timer + * in place to resynchronize periodically on the 32k clock anyway. + */ +static int cpufreq_trace_clock(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + struct pm_save_count *pm_count; + struct tc_cur_freq *new_cf, *cf; + unsigned long flags; + unsigned int new_index, index; + u64 post_val; + int cpu; + +#if 0 /* debug trace_mark */ + trace_mark(test, freq_change, + "%s cpu %u oldfreq %u newfreq %u const %u", + (val != CPUFREQ_POSTCHANGE) ? "prechange" : "postchange", + freq->cpu, freq->old, freq->new, + (freq->flags & CPUFREQ_CONST_LOOPS) ? 1 : 0); +#endif + + if (freq->flags & CPUFREQ_CONST_LOOPS) + return 0; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + local_irq_save(flags); + cpu = smp_processor_id(); + WARN_ON_ONCE(cpu != freq->cpu); + pm_count = &per_cpu(pm_save_count, cpu); + raw_spin_lock(&pm_count->lock); + + if (!pm_count->refcount) + goto end; + + /* + * Disable FIQs to ensure the floor value is indeed the + * floor. 
+ */ + local_fiq_disable(); + + if (!pm_count->dvfs_count) { + resync_on_32k(pm_count, cpu, 0, freq->new); + pm_count->clock_resync_timer.expires = jiffies + + (TC_RESYNC_PERIOD * HZ / 1000); + add_timer_on(&pm_count->clock_resync_timer, cpu); + } else { + post_val = trace_clock_read_synthetic_tsc(); + /* disable irqs to ensure we are the only value modifier */ + index = pm_count->index; + new_index = 1 - index; + cf = &pm_count->cf[index]; + new_cf = &pm_count->cf[new_index]; + new_cf->hw_base = post_val; + new_cf->virt_base = (((post_val - cf->hw_base) + * cf->mul_fact) >> 10) + cf->virt_base; + new_cf->cur_cpu_freq = freq->new; + new_cf->mul_fact = get_mul_fact(pm_count->max_cpu_freq, + freq->new); + new_cf->floor = max((((post_val - cf->hw_base) + * cf->mul_fact) >> 10) + cf->virt_base, + cf->floor); + barrier(); + pm_count->index = new_index; + } + + local_fiq_enable(); + pm_count->dvfs_count++; +end: + raw_spin_unlock(&pm_count->lock); + local_irq_restore(flags); + return 0; +} + +static struct notifier_block cpufreq_trace_clock_nb = { + .notifier_call = cpufreq_trace_clock, +}; + +#ifdef CONFIG_DEBUG_TRACE_CLOCK +/* + * Clock expected to never overflow and never go backward. + */ +static DEFINE_PER_CPU(u64, last_clock_value); +static DEFINE_PER_CPU(u32, last_ccnt_value); +DEFINE_PER_CPU(unsigned int, last_clock_nest); +EXPORT_PER_CPU_SYMBOL_GPL(last_clock_nest); + +static int tc_print_done; + +/* + * Called with interrupts disabled. 
+ */ +void trace_clock_debug(u64 value) +{ + int cpu; + + cpu = smp_processor_id(); + if (unlikely(per_cpu(last_clock_nest, cpu) != 1)) + return; /* fiq nesting, don't perform racy check */ + if (unlikely(!tc_print_done + && (per_cpu(last_clock_value, cpu) > value))) { + printk(KERN_WARNING "Trace clock going back last %llu new %llu " + "diff %llu last_ccnt %u ccnt %u\n", + (unsigned long long) per_cpu(last_clock_value, cpu), + (unsigned long long) value, + (unsigned long long) per_cpu(last_clock_value, cpu) + - value, + per_cpu(last_ccnt_value, cpu), + trace_clock_read32()); + tc_print_done = 1; + } + per_cpu(last_clock_value, cpu) = value; + per_cpu(last_ccnt_value, cpu) = trace_clock_read32();; +} +EXPORT_SYMBOL_GPL(trace_clock_debug); +#endif + +static __init int init_trace_clock(void) +{ + int cpu, ret; + u64 rem; + + ret = init_pmu(ARM_PMU_DEVICE_CPU); + if (ret && ret != -ENODEV) + return ret; + clock = get_clocksource_32k(); + /* + * clear_ccnt_interval based on the cpu fastest frequency. Never + * recomputed. 
+ */ + clear_ccnt_interval = __iter_div_u64_rem(HZ * (1ULL << 30), cpu_hz, + &rem); + printk(KERN_INFO "LTTng will clear ccnt top bit every %u jiffies.\n", + clear_ccnt_interval); + for_each_possible_cpu(cpu) { + per_cpu(pm_save_count, cpu).max_cpu_freq = + __iter_div_u64_rem(cpu_hz, 1000, &rem); + per_cpu(pm_save_count, cpu).lock = + __RAW_SPIN_LOCK_UNLOCKED(per_cpu(pm_save_count, + cpu).lock); + } + hotcpu_notifier(hotcpu_callback, 4); + cpufreq_register_notifier(&cpufreq_trace_clock_nb, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} +__initcall(init_trace_clock); diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index b6333ae3f92a..99593d71a855 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -17,6 +17,10 @@ config ARCH_OMAP1 config ARCH_OMAP2PLUS bool "TI OMAP2/3/4" + select COMMON_CLKDEV + select HAVE_TRACE_CLOCK + select HAVE_TRACE_CLOCK_32_TO_64 + select OMAP_32K_TIMER select CLKDEV_LOOKUP select OMAP_DM_TIMER help diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index 862dda95d61d..8627a5166886 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -107,6 +107,11 @@ static struct clocksource clocksource_32k = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +struct clocksource *get_clocksource_32k(void) +{ + return &clocksource_32k; +} + /* * Returns current time from boot in nsecs. It's OK for this to wrap * around for now, as it's just a relative time stamp. 
diff --git a/arch/arm/plat-omap/include/plat/clock.h b/arch/arm/plat-omap/include/plat/clock.h index 8eb0adab19ea..2ad01f37a3c0 100644 --- a/arch/arm/plat-omap/include/plat/clock.h +++ b/arch/arm/plat-omap/include/plat/clock.h @@ -297,4 +297,6 @@ extern const struct clkops clkops_null; extern struct clk dummy_ck; +struct clocksource *get_clocksource_32k(void); + #endif diff --git a/arch/arm/plat-omap/include/plat/trace-clock.h b/arch/arm/plat-omap/include/plat/trace-clock.h new file mode 100644 index 000000000000..7fcdbf98063c --- /dev/null +++ b/arch/arm/plat-omap/include/plat/trace-clock.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2009 Mathieu Desnoyers + * + * Trace clock ARM OMAP3 definitions. + */ + +#ifndef _ASM_ARM_TRACE_CLOCK_OMAP3_H +#define _ASM_ARM_TRACE_CLOCK_OMAP3_H + +#include <linux/clk.h> +#include <linux/timer.h> +#include <linux/percpu.h> +#include <plat/clock.h> + +/* + * Number of hardware clock bits. The higher order bits are expected to be 0. + * If the hardware clock source has more than 32 bits, the bits higher than the + * 32nd will be truncated by a cast to a 32 bits unsigned. Range : 1 - 32. + * (too few bits would be unrealistic though, since we depend on the timer to + * detect the overflows). + * OMAP3-specific : we clear bit 31 periodically so it never overflows. There + * is a hardware bug with CP14 and CP15 being executed at the same time a ccnt + * overflow occurs. + * + * Siarhei Siamashka <siarhei.siamashka@nokia.com> : + * Performance monitoring unit breaks if somebody is accessing CP14/CP15 + * coprocessor register exactly at the same time as CCNT overflows (regardless + * of the fact if generation of interrupts is enabled or not). A workaround + * suggested by ARM was to never allow it to overflow and reset it + * periodically. 
+ */ +#define TC_HW_BITS 31 + +/* Expected maximum interrupt latency in ms : 15ms, *2 for security */ +#define TC_EXPECTED_INTERRUPT_LATENCY 30 + +/* Resync with 32k clock each 100ms */ +#define TC_RESYNC_PERIOD 100 + +struct tc_cur_freq { + u64 cur_cpu_freq; /* in khz */ + /* cur time : (now - base) * (max_freq / cur_freq) + base */ + u32 mul_fact; /* (max_cpu_freq << 10) / cur_freq */ + u64 hw_base; /* stamp of last cpufreq change, hw cycles */ + u64 virt_base; /* same as above, virtual trace clock cycles */ + u64 floor; /* floor value, so time never go back */ +}; + +/* 32KHz counter per-cpu count save upon PM sleep and cpufreq management */ +struct pm_save_count { + struct tc_cur_freq cf[2]; /* rcu-protected */ + unsigned int index; /* tc_cur_freq current read index */ + /* + * Is fast clock ready to be read ? Read with preemption off. Modified + * only by local CPU in thread and interrupt context or by start/stop + * when time is not read concurrently. + */ + int fast_clock_ready; + + u64 int_fast_clock; + struct timer_list clear_ccnt_ms_timer; + struct timer_list clock_resync_timer; + u32 ext_32k; + int refcount; + u32 init_clock; + raw_spinlock_t lock; /* spinlock only sync the refcount */ + unsigned int dvfs_count; /* Number of DVFS updates in period */ + /* cpufreq management */ + u64 max_cpu_freq; /* in khz */ +}; + +DECLARE_PER_CPU(struct pm_save_count, pm_save_count); + +extern u64 trace_clock_read_synthetic_tsc(void); +extern void _trace_clock_write_synthetic_tsc(u64 value); +extern unsigned long long cpu_hz; + +DECLARE_PER_CPU(int, fast_clock_ready); +extern u64 _trace_clock_read_slow(void); + +/* + * ARM OMAP3 timers only return 32-bits values. We ened to extend it to a + * 64-bit value, which is provided by trace-clock-32-to-64. + */ +extern u64 trace_clock_async_tsc_read(void); +/* + * Update done by the architecture upon wakeup. 
+ */ +extern void _trace_clock_write_synthetic_tsc(u64 value); + +#ifdef CONFIG_DEBUG_TRACE_CLOCK +DECLARE_PER_CPU(unsigned int, last_clock_nest); +extern void trace_clock_debug(u64 value); +#else +static inline void trace_clock_debug(u64 value) +{ +} +#endif + +static inline u32 read_ccnt(void) +{ + u32 val; + __asm__ __volatile__ ("mrc p15, 0, %0, c9, c13, 0" : "=r" (val)); + return val & ~(1 << TC_HW_BITS); +} + +static inline u32 trace_clock_read32(void) +{ + u32 val; + + isb(); + val = read_ccnt(); + isb(); + return val; +} + +static inline u64 trace_clock_read64(void) +{ + struct pm_save_count *pm_count; + struct tc_cur_freq *cf; + u64 val; +#ifdef CONFIG_DEBUG_TRACE_CLOCK + unsigned long flags; + + local_irq_save(flags); + per_cpu(last_clock_nest, smp_processor_id())++; + barrier(); +#endif + + preempt_disable(); + pm_count = &per_cpu(pm_save_count, smp_processor_id()); + if (likely(pm_count->fast_clock_ready)) { + cf = &pm_count->cf[ACCESS_ONCE(pm_count->index)]; + val = max((((trace_clock_read_synthetic_tsc() - cf->hw_base) + * cf->mul_fact) >> 10) + cf->virt_base, cf->floor); + } else + val = _trace_clock_read_slow(); + trace_clock_debug(val); + preempt_enable(); + +#ifdef CONFIG_DEBUG_TRACE_CLOCK + barrier(); + per_cpu(last_clock_nest, smp_processor_id())--; + local_irq_restore(flags); +#endif + return val; +} + +static inline u64 trace_clock_frequency(void) +{ + return cpu_hz; +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern int get_trace_clock(void); +extern void put_trace_clock(void); +extern void get_synthetic_tsc(void); +extern void put_synthetic_tsc(void); + +extern void resync_trace_clock(void); +extern void save_sync_trace_clock(void); +extern void start_trace_clock(void); +extern void stop_trace_clock(void); + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_MIPS_TRACE_CLOCK_OMAP3_H */ diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index 
7a9c03dcb0b6..6e882a9584ef 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -66,7 +66,7 @@ static inline struct thread_info *current_thread_info(void) #endif /* !__ASSEMBLY__ */ -#define PREEMPT_ACTIVE 0x40000000 +#define PREEMPT_ACTIVE 0x10000000 /* * Thread information flags @@ -85,6 +85,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_RESTORE_SIGMASK 7 /* restore signal mask in do_signal */ #define TIF_CPU_GOING_TO_SLEEP 8 /* CPU is entering sleep 0 mode */ #define TIF_NOTIFY_RESUME 9 /* callback before returning to user */ +#define TIF_KERNEL_TRACE 10 /* kernel trace active */ #define TIF_FREEZE 29 #define TIF_DEBUG 30 /* debugging enabled */ #define TIF_USERSPACE 31 /* true if FS sets userspace */ @@ -93,28 +94,32 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_BREAKPOINT (1 << TIF_BREAKPOINT) #define _TIF_SINGLE_STEP (1 << TIF_SINGLE_STEP) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_DEBUG (1 << TIF_DEBUG) +#define _TIF_USERSPACE (1 << TIF_USERSPACE) /* Note: The masks below must never span more than 16 bits! 
*/ /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ - ((1 << TIF_SIGPENDING) \ + (_TIF_SIGPENDING \ | _TIF_NOTIFY_RESUME \ - | (1 << TIF_NEED_RESCHED) \ - | (1 << TIF_POLLING_NRFLAG) \ - | (1 << TIF_BREAKPOINT) \ - | (1 << TIF_RESTORE_SIGMASK)) + | _TIF_NEED_RESCHED \ + | _TIF_POLLING_NRFLAG \ + | _TIF_BREAKPOINT \ + | _TIF_RESTORE_SIGMASK) /* work to do on any return to userspace */ -#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | (1 << TIF_SYSCALL_TRACE) | \ - _TIF_NOTIFY_RESUME) +#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | _TIF_SYSCALL_TRACE | \ + _TIF_NOTIFY_RESUME | _TIF_KERNEL_TRACE) /* work to do on return from debug mode */ -#define _TIF_DBGWORK_MASK (_TIF_WORK_MASK & ~(1 << TIF_BREAKPOINT)) +#define _TIF_DBGWORK_MASK (_TIF_WORK_MASK & ~_TIF_BREAKPOINT) #endif /* __ASM_AVR32_THREAD_INFO_H */ diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 02560fd8a121..510e54bfd0a5 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -102,8 +102,9 @@ static inline struct thread_info *current_thread_info(void) #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ #define TIF_FREEZE 6 /* is freezing for suspend */ #define TIF_IRQ_SYNC 7 /* sync pipeline stage */ -#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ -#define TIF_SINGLESTEP 9 +#define TIF_KERNEL_TRACE 8 /* kernel trace active */ +#define TIF_NOTIFY_RESUME 9 /* callback before returning to user */ +#define TIF_SINGLESTEP 10 /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -115,8 +116,9 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_IRQ_SYNC (1<<TIF_IRQ_SYNC) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ +#define 
_TIF_WORK_MASK 0x0000FEFE /* work to do on interrupt/exception return */ #endif /* __KERNEL__ */ diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 91776069ca80..bc2024dbe32b 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* resumption notification requested */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +#define TIF_KERNEL_TRACE 4 /* kernel trace active */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ @@ -92,12 +93,16 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_FREEZE (1<<TIF_FREEZE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ -#define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ +/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK 0x0000FFFF & \ + (~_TIF_SYSCALL_TRACE | ~_TIF_KERNEL_TRACE) +/* work to do on any return to u-space */ +#define _TIF_ALLWORK_MASK 0x0000FFFF #endif /* __KERNEL__ */ diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index 11f33ead29bf..8adf256d213f 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -112,6 +112,7 @@ register struct thread_info *__current_thread_info asm("gr15"); #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define 
TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ +#define TIF_KERNEL_TRACE 6 /* kernel trace active */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_FREEZE 18 /* freezing for suspend */ @@ -122,10 +123,11 @@ register struct thread_info *__current_thread_info asm("gr15"); #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK 0x0000FFBE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ /* diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index d6f1784bfdee..65685fa45542 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -90,18 +90,20 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 4 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ #define TIF_NOTIFY_RESUME 6 /* callback before returning to user */ +#define TIF_KERNEL_TRACE 7 /* kernel trace active */ #define TIF_FREEZE 16 /* is freezing for suspend */ /* as above, but as bit values */ -#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) -#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) -#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) -#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define 
_TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_FREEZE (1<<TIF_FREEZE) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) +#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK 0x0000FFBE /* work to do on interrupt/exception return */ #endif /* __KERNEL__ */ diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index b6a5ba2aca34..3206bb5575ba 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -100,6 +100,7 @@ struct thread_info { #define TIF_SYSCALL_TRACE 2 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ +#define TIF_KERNEL_TRACE 5 /* kernel trace active */ #define TIF_NOTIFY_RESUME 6 /* resumption notification requested */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ @@ -111,7 +112,9 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_SYSCALL_TRACEAUDIT (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) +#define _TIF_SYSCALL_TRACEAUDIT (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|\ + _TIF_SINGLESTEP|_TIF_KERNEL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -124,8 +127,9 @@ struct thread_info { /* "work to do on user-return" bits */ #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ _TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE) -/* like TIF_ALLWORK_BITS but sans 
TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ -#define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) +/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE, TIF_KERNEL_TRACE or TIF_SYSCALL_AUDIT */ +#define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE|\ + _TIF_SYSCALL_AUDIT)) #define TS_POLLING 1 /* true if in idle loop and not sleeping */ #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 244704a174de..56c4de30197c 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -620,9 +620,11 @@ GLOBAL_ENTRY(ia64_ret_from_clone) ;; ld4 r2=[r2] ;; + movl r8=_TIF_SYSCALL_TRACEAUDIT + ;; // added stop bits to prevent r8 dependency + and r2=r8,r2 mov r8=0 - and r2=_TIF_SYSCALL_TRACEAUDIT,r2 - ;; + ;; // added stop bits to prevent r2 dependency cmp.ne p6,p0=r2,r0 (p6) br.cond.spnt .strace_check_retval ;; // added stop bits to prevent r8 dependency diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 71faff5bcc27..8538b1a0eaf5 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -139,6 +139,7 @@ static inline unsigned int get_thread_fault_code(void) #define TIF_SINGLESTEP 3 /* restore singlestep on return to user mode */ #define TIF_IRET 4 /* return with iret */ #define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ +#define TIF_KERNEL_TRACE 6 /* kernel trace active */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -150,14 +151,19 @@ static inline unsigned int get_thread_fault_code(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) #define _TIF_IRET (1<<TIF_IRET) +#define _TIF_KERNEL_TRACE 
(1<<TIF_KERNEL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_USEDFPU (1<<TIF_USEDFPU) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_FREEZE (1<<TIF_FREEZE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ -#define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ +/* work to do on any return to u-space */ +#define _TIF_ALLWORK_MASK 0x0000FFFF + +/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK \ + (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | _TIF_KERNEL_TRACE)) /* * Thread-synchronous status. diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 790988967ba7..fa8256a17ebb 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -100,6 +100,7 @@ static inline struct thread_info *current_thread_info(void) */ #define TIF_SIGPENDING 6 /* signal pending */ #define TIF_NEED_RESCHED 7 /* rescheduling necessary */ +#define TIF_KERNEL_TRACE 13 /* kernel trace active */ #define TIF_DELAYED_TRACE 14 /* single step a syscall */ #define TIF_SYSCALL_TRACE 15 /* syscall trace active */ #define TIF_MEMDIE 16 /* is terminating due to OOM killer */ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f5ecc0566bc2..e0c246d26e11 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -17,6 +17,7 @@ config MIPS select HAVE_KRETPROBES select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT + select HAVE_LTT_DUMP_TABLES select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS @@ -1963,6 +1964,20 @@ config CPU_R4000_WORKAROUNDS config CPU_R4400_WORKAROUNDS bool +config HAVE_GET_CYCLES_32 + def_bool y + depends on !CPU_R4400_WORKAROUNDS + depends on !CPU_CAVIUM_OCTEON + select HAVE_TRACE_CLOCK + select HAVE_TRACE_CLOCK_32_TO_64 + select HAVE_UNSYNCHRONIZED_TSC + +config HAVE_GET_CYCLES + def_bool y + depends on 
CPU_CAVIUM_OCTEON + select HAVE_TRACE_CLOCK + select HAVE_UNSYNCHRONIZED_TSC + # # - Highmem only makes sense for the 32-bit kernel. # - The current highmem code will only work properly on physically indexed diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h index c0884f02d3a6..1419b787e1e2 100644 --- a/arch/mips/include/asm/barrier.h +++ b/arch/mips/include/asm/barrier.h @@ -178,4 +178,10 @@ #define nudge_writes() mb() #endif +/* + * MIPS does not have any instruction to serialize instruction execution on the + * core. + */ +#define sync_core() + #endif /* __ASM_BARRIER_H */ diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index 4d9870975382..44f631b39ffc 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -438,6 +438,7 @@ */ #define CAUSEB_EXCCODE 2 #define CAUSEF_EXCCODE (_ULCAST_(31) << 2) +#define CAUSE_EXCCODE(cause) (((cause) & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE) #define CAUSEB_IP 8 #define CAUSEF_IP (_ULCAST_(255) << 8) #define CAUSEB_IP0 8 diff --git a/arch/mips/include/asm/octeon/trace-clock.h b/arch/mips/include/asm/octeon/trace-clock.h new file mode 100644 index 000000000000..062662b732a2 --- /dev/null +++ b/arch/mips/include/asm/octeon/trace-clock.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2005,2008 Mathieu Desnoyers + * + * Trace clock MIPS Octeon definitions. 
+ */ + +#ifndef _ASM_MIPS_OCTEON_TRACE_CLOCK_H +#define _ASM_MIPS_OCTEON_TRACE_CLOCK_H + +#include <asm/octeon/octeon.h> + +#define TC_HW_BITS 64 + +static inline u32 trace_clock_read32(void) +{ + return (u32)read_c0_cvmcount(); /* only need the 32 LSB */ +} + +static inline u64 trace_clock_read64(void) +{ + return read_c0_cvmcount(); +} + +static inline u64 trace_clock_frequency(void) +{ + return octeon_get_clock_rate(); +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +static inline int get_trace_clock(void) +{ + return 0; +} + +static inline void put_trace_clock(void) +{ + return; +} +#endif /* _ASM_MIPS_OCTEON_TRACE_CLOCK_H */ diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index d309556cacf8..eb7f7b99038a 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -122,6 +122,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define TIF_32BIT_ADDR 23 /* 32-bit address space (o32/n32) */ #define TIF_FPUBOUND 24 /* thread bound to FPU-full CPU set */ #define TIF_LOAD_WATCH 25 /* If set, load watch registers */ +#define TIF_KERNEL_TRACE 30 /* kernel trace active */ #define TIF_SYSCALL_TRACE 31 /* syscall trace active */ #ifdef CONFIG_MIPS32_O32 @@ -131,6 +132,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #endif /* CONFIG_MIPS32_O32 */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) @@ -151,7 +153,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define _TIF_WORK_MASK (0x0000ffef & \ ~(_TIF_SECCOMP | _TIF_SYSCALL_AUDIT)) /* work to do on any return to u-space */ -#define _TIF_ALLWORK_MASK (0x8000ffff & ~_TIF_SECCOMP) +#define _TIF_ALLWORK_MASK (0xc000ffff & ~_TIF_SECCOMP) #endif /* __KERNEL__ */ diff --git 
a/arch/mips/include/asm/timex.h b/arch/mips/include/asm/timex.h index 6529704aa73a..6c150979fa24 100644 --- a/arch/mips/include/asm/timex.h +++ b/arch/mips/include/asm/timex.h @@ -20,6 +20,8 @@ */ #define CLOCK_TICK_RATE 1193182 +extern unsigned int mips_hpt_frequency; + /* * Standard way to access the cycle counter. * Currently only used on SMP for scheduling. @@ -29,14 +31,109 @@ * which isn't an evil thing. * * We know that all SMP capable CPUs have cycle counters. + * + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * HAVE_GET_CYCLES makes sure that this case is handled properly : + * + * Ralf Baechle <ralf@linux-mips.org> : + * This avoids us executing an mfc0 c0_count instruction on processors which + * don't have but also on certain R4000 and R4400 versions where reading from + * the count register just in the very moment when its value equals c0_compare + * will result in the timer interrupt getting lost. */ +#ifdef CONFIG_HAVE_GET_CYCLES +# ifdef CONFIG_CPU_CAVIUM_OCTEON +typedef unsigned long cycles_t; + +static inline cycles_t get_cycles(void) +{ + return read_c0_cvmcount(); +} + +static inline void get_cycles_barrier(void) +{ +} + +static inline cycles_t get_cycles_rate(void) +{ + return mips_hpt_frequency; +} + +extern int test_tsc_synchronization(void); +extern int _tsc_is_sync; +static inline int tsc_is_sync(void) +{ + return _tsc_is_sync; +} +# else /* #ifdef CONFIG_CPU_CAVIUM_OCTEON */ +# error "64-bit get_cycles() supported only on Cavium Octeon MIPS architectures" +# endif /* #else #ifdef CONFIG_CPU_CAVIUM_OCTEON */ +#elif defined(CONFIG_HAVE_GET_CYCLES_32) typedef unsigned int cycles_t; static inline cycles_t get_cycles(void) { + return read_c0_count(); +} + +static inline void get_cycles_barrier(void) +{ +} + +static inline cycles_t get_cycles_rate(void) +{ + return mips_hpt_frequency; +} + +extern int test_tsc_synchronization(void); +extern int _tsc_is_sync; +static inline int tsc_is_sync(void) +{ + return _tsc_is_sync; +} +#else 
+typedef unsigned int cycles_t; + +static inline cycles_t get_cycles(void) +{ + return 0; +} +static inline int test_tsc_synchronization(void) +{ return 0; } +static inline int tsc_is_sync(void) +{ + return 0; +} +#endif + +#define DELAY_INTERRUPT 100 +/* + * Only updates 32 LSB. + */ +static inline void write_tsc(u32 val1, u32 val2) +{ + write_c0_count(val1); + /* Arrange for an interrupt in a short while */ + write_c0_compare(read_c0_count() + DELAY_INTERRUPT); +} + +/* + * Currently unused, should update internal tsc-related timekeeping sources. + */ +static inline void mark_tsc_unstable(char *reason) +{ +} + +/* + * Currently simply use the tsc_is_sync value. + */ +static inline int unsynchronized_tsc(void) +{ + return !tsc_is_sync(); +} #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/trace-clock.h b/arch/mips/include/asm/trace-clock.h new file mode 100644 index 000000000000..9bbcf999befb --- /dev/null +++ b/arch/mips/include/asm/trace-clock.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2005,2008 Mathieu Desnoyers + * + * Trace clock MIPS definitions. + */ + +#ifndef _ASM_MIPS_TRACE_CLOCK_H +#define _ASM_MIPS_TRACE_CLOCK_H + +#include <linux/timex.h> +#include <asm/processor.h> + +#define TRACE_CLOCK_MIN_PROBE_DURATION 200 + +#ifdef CONFIG_CPU_CAVIUM_OCTEON +# include <asm/octeon/trace-clock.h> +#else /* !CONFIG_CPU_CAVIUM_OCTEON */ +/* + * Number of hardware clock bits. The higher order bits are expected to be 0. + * If the hardware clock source has more than 32 bits, the bits higher than the + * 32nd will be truncated by a cast to a 32 bits unsigned. Range : 1 - 32. + * (too few bits would be unrealistic though, since we depend on the timer to + * detect the overflows). + */ +#define TC_HW_BITS 32 + +/* Expected maximum interrupt latency in ms : 15ms, *2 for security */ +#define TC_EXPECTED_INTERRUPT_LATENCY 30 + +extern u64 trace_clock_read_synthetic_tsc(void); + +/* + * MIPS get_cycles only returns a 32 bits TSC (see timex.h). 
The assumption + * there is that the reschedule is done every 8 seconds or so. Given that + * tracing needs to detect delays longer than 8 seconds, we need a full 64-bits + * TSC, whic is provided by trace-clock-32-to-64. +*/ + +static inline u32 trace_clock_read32(void) +{ + return (u32)get_cycles(); /* only need the 32 LSB */ +} + +static inline u64 trace_clock_read64(void) +{ + return trace_clock_read_synthetic_tsc(); +} + +static inline u64 trace_clock_frequency(void) +{ + return get_cycles_rate(); +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern void get_synthetic_tsc(void); +extern void put_synthetic_tsc(void); + +static inline int get_trace_clock(void) +{ + get_synthetic_tsc(); + return 0; +} + +static inline void put_trace_clock(void) +{ + put_synthetic_tsc(); +} +#endif /* CONFIG_CPU_CAVIUM_OCTEON */ + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_MIPS_TRACE_CLOCK_H */ diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S index ffa331029e08..8c5410f97e7c 100644 --- a/arch/mips/kernel/entry.S +++ b/arch/mips/kernel/entry.S @@ -167,7 +167,7 @@ work_notifysig: # deal with pending signals and FEXPORT(syscall_exit_work_partial) SAVE_STATIC syscall_exit_work: - li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT + li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_KERNEL_TRACE and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? local_irq_enable # could let do_syscall_trace() diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 876a75cc376f..76a82609626f 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -34,6 +34,7 @@ #include <linux/vfs.h> #include <linux/ipc.h> #include <linux/slab.h> +#include <trace/ipc.h> #include <net/sock.h> #include <net/scm.h> @@ -44,6 +45,8 @@ #include <asm/mmu_context.h> #include <asm/mman.h> +DEFINE_TRACE(ipc_call); + /* Use this to get at 32-bit user passed pointers. 
*/ /* A() macro should be used for places where you e.g. have some internal variable u32 and just want to get @@ -166,6 +169,8 @@ SYSCALL_DEFINE6(32_ipc, u32, call, long, first, long, second, long, third, version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ae167df73ddd..7d9bb1cdd7f9 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -25,6 +25,7 @@ #include <linux/completion.h> #include <linux/kallsyms.h> #include <linux/random.h> +#include <trace/sched.h> #include <asm/asm.h> #include <asm/bootinfo.h> @@ -42,6 +43,8 @@ #include <asm/inst.h> #include <asm/stacktrace.h> +DEFINE_TRACE(sched_kthread_create); + /* * The idle thread. There's no useful work to be done, so just try to conserve * power and have a low exit latency (ie sit in a loop waiting for somebody to @@ -234,6 +237,7 @@ static void __noreturn kernel_thread_helper(void *arg, int (*fn)(void *)) long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; + long pid; memset(®s, 0, sizeof(regs)); @@ -249,7 +253,10 @@ long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) #endif /* Ok, create the new process.. 
*/ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, + 0, ®s, 0, NULL, NULL); + trace_sched_kthread_create(fn, pid); + return pid; } /* diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index d21c388c0116..79e1750cc7cc 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -25,6 +25,7 @@ #include <linux/security.h> #include <linux/audit.h> #include <linux/seccomp.h> +#include <trace/syscall.h> #include <asm/byteorder.h> #include <asm/cpu.h> @@ -39,6 +40,9 @@ #include <asm/bootinfo.h> #include <asm/reg.h> +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* * Called by kernel/ptrace.c when detaching.. * @@ -535,6 +539,11 @@ static inline int audit_arch(void) */ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) { + if (!entryexit) + trace_syscall_entry(regs, regs->regs[2]); + else + trace_syscall_exit(regs->regs[2]); + /* do the secure computing check first */ if (!entryexit) secure_computing(regs->regs[2]); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index fbaabad0e6e2..1b90e8255dab 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -52,7 +52,7 @@ NESTED(handle_sys, PT_SIZE, sp) stack_done: lw t0, TI_FLAGS($28) # syscall tracing enabled? - li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT + li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_KERNEL_TRACE and t0, t1 bnez t0, syscall_trace_entry # -> yes diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 3f4179283207..c574a1a12f21 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -54,7 +54,7 @@ NESTED(handle_sys64, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting - li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT + li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_KERNEL_TRACE LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? 
and t0, t1, t0 bnez t0, syscall_trace_entry @@ -126,7 +126,8 @@ illegal_syscall: END(handle_sys64) .align 3 -sys_call_table: + .type sys_call_table,@object +EXPORT(sys_call_table) PTR sys_read /* 5000 */ PTR sys_write PTR sys_open diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index f08ece6d8acc..0d312c2d54d2 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -53,7 +53,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting - li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT + li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_KERNEL_TRACE LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 bnez t0, n32_syscall_trace_entry @@ -121,6 +121,8 @@ not_n32_scall: END(handle_sysn32) + .align 3 + .type sysn32_call_table,@object EXPORT(sysn32_call_table) PTR sys_read /* 6000 */ PTR sys_write diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 78d768a3e19d..635d0d84344e 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -53,7 +53,7 @@ NESTED(handle_sys, PT_SIZE, sp) sll a3, a3, 0 dsll t0, v0, 3 # offset into table - ld t2, (sys_call_table - (__NR_O32_Linux * 8))(t0) + ld t2, (syso32_call_table - (__NR_O32_Linux * 8))(t0) sd a3, PT_R26(sp) # save a3 for syscall restarting @@ -81,7 +81,7 @@ NESTED(handle_sys, PT_SIZE, sp) PTR 4b, bad_stack .previous - li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT + li t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_KERNEL_TRACE LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? 
and t0, t1, t0 bnez t0, trace_a_syscall @@ -180,7 +180,7 @@ LEAF(sys32_syscall) beqz t0, einval # do not recurse dsll t1, t0, 3 beqz v0, einval - ld t2, sys_call_table(t1) # syscall routine + ld t2, syso32_call_table(t1) # syscall routine move a0, a1 # shift argument registers move a1, a2 @@ -202,8 +202,8 @@ einval: li v0, -ENOSYS END(sys32_syscall) .align 3 - .type sys_call_table,@object -sys_call_table: + .type syso32_call_table,@object +EXPORT(syso32_call_table) PTR sys32_syscall /* 4000 */ PTR sys_exit PTR sys_fork diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 383aeb95cb49..3faf9d20ee62 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -164,6 +164,9 @@ void __init smp_cpus_done(unsigned int max_cpus) { mp_ops->cpus_done(); synchronise_count_master(); +#ifdef CONFIG_HAVE_UNSYNCHRONIZED_TSC + test_tsc_synchronization(); +#endif } /* called from main before smp_init() */ diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 1dc6edff45e0..9965dedbcc15 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -31,6 +31,8 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/elf.h> +#include <linux/ipc.h> +#include <linux/kallsyms.h> #include <asm/asm.h> #include <asm/branch.h> @@ -464,3 +466,67 @@ int kernel_execve(const char *filename, return -__v0; } + +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + +#ifdef CONFIG_32BIT + for (i = 0; i < __NR_O32_Linux_syscalls; i++) { + extern struct { + unsigned long ptr; + long j; + } sys_call_table[]; + + sprint_symbol(namebuf, sys_call_table[i].ptr); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i + __NR_O32_Linux, (void *)sys_call_table[i].ptr, + namebuf); + } +#endif +#ifdef CONFIG_64BIT +# ifdef CONFIG_MIPS32_O32 + for (i = 0; i < __NR_O32_Linux_syscalls; i++) { + extern unsigned long syso32_call_table[]; + + sprint_symbol(namebuf, 
syso32_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i + __NR_O32_Linux, (void *)syso32_call_table[i], + namebuf); + } +# endif + + for (i = 0; i < __NR_64_Linux_syscalls; i++) { + extern unsigned long sys_call_table[]; + + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i + __NR_64_Linux, (void *)sys_call_table[i], + namebuf); + } + +# ifdef CONFIG_MIPS32_N32 + for (i = 0; i < __NR_N32_Linux_syscalls; i++) { + extern unsigned long sysn32_call_table[]; + + sprint_symbol(namebuf, sysn32_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i + __NR_N32_Linux, (void *)sysn32_call_table[i], + namebuf); + } +# endif +#endif +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); + +void ltt_dump_idt_table(void *call_data) +{ + /* No IDT information yet. */ + return; +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index fb7497405510..51561a75dcf5 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq); */ unsigned int mips_hpt_frequency; +EXPORT_SYMBOL(mips_hpt_frequency); /* * This function exists in order to cause an error due to a duplicate diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 71350f7f2d88..b6a12d70e8c6 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -30,6 +30,7 @@ #include <linux/kdb.h> #include <linux/irq.h> #include <linux/perf_event.h> +#include <trace/trap.h> #include <asm/bootinfo.h> #include <asm/branch.h> @@ -55,6 +56,12 @@ #include <asm/stacktrace.h> #include <asm/uasm.h> +/* + * Also used in unaligned.c and fault.c. 
+ */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + extern void check_wait(void); extern asmlinkage void r4k_wait(void); extern asmlinkage void rollback_handle_int(void); @@ -321,7 +328,7 @@ static void __show_regs(const struct pt_regs *regs) printk("Cause : %08x\n", cause); - cause = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE; + cause = CAUSE_EXCCODE(cause); if (1 <= cause && cause <= 5) printk("BadVA : %0*lx\n", field, regs->cp0_badvaddr); @@ -698,6 +705,7 @@ asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31) return; die_if_kernel("FP exception in kernel code", regs); + trace_trap_entry(regs, CAUSE_EXCCODE(regs->cp0_cause)); if (fcr31 & FPU_CSR_UNI_X) { int sig; void __user *fault_addr = NULL; @@ -730,7 +738,7 @@ asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31) /* If something went wrong, signal */ process_fpemu_return(sig, fault_addr); - + trace_trap_exit(); return; } else if (fcr31 & FPU_CSR_INV_X) info.si_code = FPE_FLTINV; @@ -748,6 +756,7 @@ asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31) info.si_errno = 0; info.si_addr = (void __user *) regs->cp0_epc; force_sig_info(SIGFPE, &info, current); + trace_trap_exit(); } static void do_trap_or_bp(struct pt_regs *regs, unsigned int code, @@ -979,6 +988,8 @@ asmlinkage void do_cpu(struct pt_regs *regs) int status; unsigned long __maybe_unused flags; + trace_trap_entry(regs, CAUSE_EXCCODE(regs->cp0_cause)); + die_if_kernel("do_cpu invoked from kernel context!", regs); cpid = (regs->cp0_cause >> CAUSEB_CE) & 3; @@ -990,8 +1001,10 @@ asmlinkage void do_cpu(struct pt_regs *regs) opcode = 0; status = -1; - if (unlikely(compute_return_epc(regs) < 0)) + if (unlikely(compute_return_epc(regs) < 0)) { + trace_trap_exit(); return; + } if (unlikely(get_user(opcode, epc) < 0)) status = SIGSEGV; @@ -1009,7 +1022,7 @@ asmlinkage void do_cpu(struct pt_regs *regs) regs->cp0_epc = old_epc; /* Undo skip-over. 
*/ force_sig(status, current); } - + trace_trap_exit(); return; case 1: @@ -1029,11 +1042,12 @@ asmlinkage void do_cpu(struct pt_regs *regs) if (!process_fpemu_return(sig, fault_addr)) mt_ase_fp_affinity(); } - + trace_trap_exit(); return; case 2: raw_notifier_call_chain(&cu2_chain, CU2_EXCEPTION, regs); + trace_trap_exit(); return; case 3: @@ -1041,6 +1055,7 @@ asmlinkage void do_cpu(struct pt_regs *regs) } force_sig(SIGILL, current); + trace_trap_exit(); } asmlinkage void do_mdmx(struct pt_regs *regs) diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index cfea1adfa153..d3af94de2402 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -79,6 +79,7 @@ #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/perf_event.h> +#include <trace/trap.h> #include <asm/asm.h> #include <asm/branch.h> @@ -518,6 +519,7 @@ asmlinkage void do_ade(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, 0, regs, regs->cp0_badvaddr); + trace_trap_entry(regs, CAUSE_EXCCODE(regs->cp0_cause)); /* * Did we catch a fault trying to load an instruction? * Or are we running in MIPS16 mode? 
@@ -543,6 +545,8 @@ asmlinkage void do_ade(struct pt_regs *regs) emulate_load_store_insn(regs, (void __user *)regs->cp0_badvaddr, pc); set_fs(seg); + trace_trap_exit(); + return; sigbus: @@ -552,6 +556,8 @@ sigbus: /* * XXX On return from the signal handler we should advance the epc */ + + trace_trap_exit(); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 137ee76a0045..1a5bd7b90183 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/perf_event.h> +#include <trace/fault.h> #include <asm/branch.h> #include <asm/mmu_context.h> @@ -28,6 +29,9 @@ #include <asm/highmem.h> /* For VMALLOC_END */ #include <linux/kdebug.h> +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -144,7 +148,10 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ + trace_page_fault_entry(regs, CAUSE_EXCCODE(regs->cp0_cause), mm, vma, + address, write); fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + trace_page_fault_exit(fault); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index aa8de727e90b..cfcb2e70eba8 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -62,6 +62,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ #define TIF_SINGLESTEP 9 /* single stepping? */ #define TIF_BLOCKSTEP 10 /* branch stepping? 
*/ +#define TIF_KERNEL_TRACE 11 /* kernel trace active */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -73,6 +74,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7d69e9bf5e64..b13eeab289a6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -117,11 +117,13 @@ config PPC select HAVE_IOREMAP_PROT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_KPROBES + select HAVE_TRACE_CLOCK select HAVE_ARCH_KGDB select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK select HAVE_DMA_ATTRS + select HAVE_GET_CYCLES if PPC64 select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 65eb85976a03..34ba6e0a4d8d 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -100,7 +100,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 4 /* 32 bit binary */ -#define TIF_PERFMON_WORK 5 /* work for pfm_handle_work() */ +#define TIF_KERNEL_TRACE 5 /* kernel trace active */ #define TIF_PERFMON_CTXSW 6 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ @@ -111,6 +111,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ #define TIF_RUNLATCH 15 /* Is the runlatch enabled? 
*/ +#define TIF_PERFMON_WORK 16 /* work for pfm_handle_work() */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -118,7 +119,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_32BIT (1<<TIF_32BIT) -#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) @@ -128,7 +129,8 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_RUNLATCH (1<<TIF_RUNLATCH) -#define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP) +#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) +#define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP|_TIF_KERNEL_TRACE) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ _TIF_NOTIFY_RESUME) diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index c55e14f7ef44..2fe7460cbf9c 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -14,6 +14,8 @@ typedef unsigned long cycles_t; +extern unsigned long tb_ticks_per_sec; + static inline cycles_t get_cycles(void) { #ifdef __powerpc64__ @@ -46,5 +48,15 @@ static inline cycles_t get_cycles(void) #endif } +static inline cycles_t get_cycles_rate(void) +{ + return tb_ticks_per_sec; +} + +static inline void get_cycles_barrier(void) +{ + isync(); +} + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TIMEX_H */ diff --git a/arch/powerpc/include/asm/trace-clock.h b/arch/powerpc/include/asm/trace-clock.h new file mode 100644 index 000000000000..05facc3e372b --- /dev/null +++ b/arch/powerpc/include/asm/trace-clock.h @@ -0,0 +1,48 @@ +/* + * Copyright 
(C) 2005,2008 Mathieu Desnoyers + * + * Trace clock PowerPC definitions. + * + * Use get_tb() directly to insure reading a 64-bits value on powerpc 32. + */ + +#ifndef _ASM_TRACE_CLOCK_H +#define _ASM_TRACE_CLOCK_H + +#include <linux/timex.h> +#include <linux/time.h> +#include <asm/time.h> + +static inline u32 trace_clock_read32(void) +{ + return get_tbl(); +} + +static inline u64 trace_clock_read64(void) +{ + return get_tb(); +} + +static inline unsigned int trace_clock_frequency(void) +{ + return get_cycles_rate(); +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +static inline int get_trace_clock(void) +{ + return 0; +} + +static inline void put_trace_clock(void) +{ +} + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_TRACE_CLOCK_H */ diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index cbe2297d68b6..d1c27723f848 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -7,7 +7,7 @@ #include <linux/tracepoint.h> struct pt_regs; - +#if 0 /* disabled by Mathieu Desnoyers. Belongs to generic IRQS. 
*/ TRACE_EVENT(irq_entry, TP_PROTO(struct pt_regs *regs), @@ -41,6 +41,7 @@ TRACE_EVENT(irq_exit, TP_printk("pt_regs=%p", __entry->regs) ); +#endif //0 TRACE_EVENT(timer_interrupt_entry, diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ce557f6f00fc..d21cf5bc5037 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -401,8 +401,6 @@ void do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); unsigned int irq; - trace_irq_entry(regs); - irq_enter(); check_stack_overflow(); @@ -425,8 +423,6 @@ void do_IRQ(struct pt_regs *regs) timer_interrupt(regs); } #endif - - trace_irq_exit(regs); } void __init init_IRQ(void) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 094bd9821ad4..8294f73feac2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -665,7 +665,7 @@ _GLOBAL(abs) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(original_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 206a321a71d3..1e10e579922a 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -415,7 +415,7 @@ _GLOBAL(scom970_write) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(original_kernel_thread) std r29,-24(r1) std r30,-16(r1) stdu r1,-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index ef3ef566235e..046b16e4a571 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -161,6 +161,9 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); +#ifndef CONFIG_SPARSE_IRQ +EXPORT_SYMBOL(irq_desc); +#endif EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(cacheable_memcpy); EXPORT_SYMBOL(cacheable_memzero); diff --git a/arch/powerpc/kernel/process.c 
b/arch/powerpc/kernel/process.c index 8303a6c65ef7..0a85886a7848 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -38,6 +38,7 @@ #include <linux/personality.h> #include <linux/random.h> #include <linux/hw_breakpoint.h> +#include <trace/sched.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -55,6 +56,8 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> +DEFINE_TRACE(sched_kthread_create); + extern unsigned long _get_SP(void); #ifndef CONFIG_SMP @@ -663,6 +666,17 @@ void show_regs(struct pt_regs * regs) show_instructions(regs); } +long original_kernel_thread(int (*fn) (void *), void *arg, unsigned long flags); + +long kernel_thread(int (fn) (void *), void *arg, unsigned long flags) +{ + long retval; + + retval = original_kernel_thread(fn, arg, flags); + trace_sched_kthread_create(fn, retval); + return retval; +} + void exit_thread(void) { discard_lazy_cpu_state(); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 906536998291..fb8924c5fdff 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -34,12 +34,16 @@ #endif #include <linux/hw_breakpoint.h> #include <linux/perf_event.h> +#include <trace/syscall.h> #include <asm/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/system.h> +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* * The parameter save area on the stack is used to store arguments being passed * to callee function and is located at fixed offset from stack pointer. 
@@ -1680,6 +1684,8 @@ long do_syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + trace_syscall_entry(regs, regs->gpr[0]); + secure_computing(regs->gpr[0]); if (test_thread_flag(TIF_SYSCALL_TRACE) && @@ -1715,6 +1721,8 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; + trace_syscall_exit(regs->result); + if (unlikely(current->audit_context)) audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, regs->result); diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 4e5bf1edc0f2..5fe3cb1f38bb 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -41,6 +41,7 @@ #include <linux/elf.h> #include <linux/ipc.h> #include <linux/slab.h> +#include <trace/ipc.h> #include <asm/ptrace.h> #include <asm/types.h> @@ -51,6 +52,7 @@ #include <asm/ppc-pci.h> #include <asm/syscalls.h> +DEFINE_TRACE(ipc_call); asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, @@ -79,6 +81,8 @@ long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t pt version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMTIMEDOP: diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 09d31dbf43f9..fcbe3f5c0744 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -54,6 +54,7 @@ #include <linux/irq.h> #include <linux/delay.h> #include <linux/irq_work.h> +#include <trace/trap.h> #include <asm/trace.h> #include <asm/io.h> @@ -585,6 +586,8 @@ void timer_interrupt(struct pt_regs * regs) * some CPUs will continuue to take decrementer exceptions */ set_dec(DECREMENTER_MAX); + trace_trap_entry(regs, regs->trap); + #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC) if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); @@ -631,6 +634,7 @@ void timer_interrupt(struct pt_regs * regs) set_irq_regs(old_regs); 
trace_timer_interrupt_exit(regs); + trace_trap_exit(); } #ifdef CONFIG_SUSPEND diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bd74fac169be..62ae8cad7929 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -34,6 +34,8 @@ #include <linux/bug.h> #include <linux/kdebug.h> #include <linux/debugfs.h> +#include <linux/ltt-core.h> +#include <trace/trap.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> @@ -75,6 +77,12 @@ EXPORT_SYMBOL(__debugger_fault_handler); #endif /* + * Also used in time.c and fault.c. + */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + +/* * Trap & Exception support */ @@ -141,6 +149,10 @@ int die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_NUMA printk("NUMA "); #endif +#ifdef CONFIG_LTT + printk("LTT NESTING LEVEL : %u ", __get_cpu_var(ltt_nesting)); + printk("\n"); +#endif printk("%s\n", ppc_md.name ? ppc_md.name : ""); sysfs_printk_last_file(); @@ -204,11 +216,14 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) addr, regs->nip, regs->link, code); } + trace_trap_entry(regs, regs->trap); + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; info.si_addr = (void __user *) addr; force_sig_info(signr, &info, current); + trace_trap_exit(); } #ifdef CONFIG_PPC64 @@ -1087,7 +1102,9 @@ void performance_monitor_exception(struct pt_regs *regs) { __get_cpu_var(irq_stat).pmu_irqs++; + trace_trap_entry(regs, regs->trap); perf_irq(regs); + trace_trap_exit(); } #ifdef CONFIG_8xx @@ -1308,12 +1325,14 @@ void altivec_assist_exception(struct pt_regs *regs) /* got an error reading the instruction */ _exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip); } else { + trace_trap_entry(regs, regs->trap); /* didn't recognize the instruction */ /* XXX quick hack for now: set the non-Java bit in the VSCR */ if (printk_ratelimit()) printk(KERN_ERR "Unrecognized altivec instruction " "in %s at %lx\n", current->comm, regs->nip); 
current->thread.vscr.u[3] |= 0x10000; + trace_trap_exit(); } } #endif /* CONFIG_ALTIVEC */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 54f4fb994e99..2fb3d0ce2222 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -31,6 +31,7 @@ #include <linux/kdebug.h> #include <linux/perf_event.h> #include <linux/magic.h> +#include <trace/fault.h> #include <asm/firmware.h> #include <asm/page.h> @@ -43,6 +44,9 @@ #include <asm/siginfo.h> #include <mm/mmu_decl.h> +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); + #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs) { @@ -309,7 +313,9 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ + trace_page_fault_entry(regs, regs->trap, mm, vma, address, is_write); ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); + trace_page_fault_exit(ret); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 3c7c3f82d842..3f0985252596 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -30,6 +30,7 @@ #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/marker.h> #include <asm/io.h> #include <asm/time.h> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 0b0466284932..a7b4b8838632 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -40,6 +40,7 @@ #include <linux/pid_namespace.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/marker.h> #include <asm/io.h> #include <asm/mmu_context.h> diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index ad1382f7932e..637ce13517a0 100644 --- a/arch/s390/include/asm/thread_info.h +++ 
b/arch/s390/include/asm/thread_info.h @@ -94,6 +94,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ +#define TIF_KERNEL_TRACE 12 /* kernel trace active */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_31BIT 17 /* 32bit process */ @@ -113,6 +114,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_31BIT (1<<TIF_31BIT) #define _TIF_SINGLE_STEP (1<<TIF_FREEZE) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 648f64239a9d..831874ffbc6d 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,7 +52,8 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8 | \ + _TIF_KERNEL_TRACE>>8) STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER STACK_SIZE = 1 << STACK_SHIFT diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 9d3603d6c511..7b5255e3b94e 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -55,7 +55,8 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) + _TIF_SECCOMP>>8 | 
_TIF_SYSCALL_TRACEPOINT>>8 | \ + _TIF_KERNEL_TRACE>>8) #define BASED(name) name-system_call(%r13) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index ef86ad243986..df0af62a76dc 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -45,6 +45,9 @@ enum s390_regset { REGSET_GENERAL_EXTENDED, }; +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + void update_per_regs(struct task_struct *task) { static const struct per_regs per_single_step = { @@ -723,6 +726,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) /* Do the secure computing check first. */ secure_computing(regs->gprs[2]); + trace_syscall_entry(regs, regs->gprs[2]); /* * The sysc_tracesys code in entry.S stored the system * call number to gprs[2]. @@ -753,6 +757,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { + trace_syscall_exit(regs->gprs[2]); if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index 476081440df9..dcc9d509af08 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -29,9 +29,12 @@ #include <linux/personality.h> #include <linux/unistd.h> #include <linux/ipc.h> +#include <trace/ipc.h> #include <asm/uaccess.h> #include "entry.h" +DEFINE_TRACE(ipc_call); + /* * Perform the mmap() system call. 
Linux for S/390 isn't able to handle more * than 5 system call parameters, so this system call uses a memory block @@ -70,6 +73,8 @@ SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, struct ipc_kludge tmp; int ret; + trace_ipc_call(call, first); + switch (call) { case SEMOP: return sys_semtimedop(first, (struct sembuf __user *)ptr, diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index b5a4a739b477..ff0dc02adc38 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -5,6 +5,7 @@ * Copyright (C) 1999,2000 IBM Deutschland Entwicklung GmbH, IBM Corporation * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * Portions added by T. Halloran: (C) Copyright 2002 IBM Poughkeepsie, IBM Corporation * * Derived from "arch/i386/kernel/traps.c" * Copyright (C) 1991, 1992 Linus Torvalds @@ -33,6 +34,7 @@ #include <linux/kprobes.h> #include <linux/bug.h> #include <linux/utsname.h> +#include <trace/trap.h> #include <asm/system.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -65,6 +67,12 @@ static int kstack_depth_to_print = 20; #endif /* CONFIG_64BIT */ /* + * Also used in fault.c. 
+ */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + +/* * For show_trace we have tree different stack to consider: * - the panic stack which is used if the kernel stack has overflown * - the asynchronous interrupt stack (cpu related) @@ -299,6 +307,8 @@ static inline void __kprobes do_trap(long pgm_int_code, int signr, char *str, pgm_int_code, signr) == NOTIFY_STOP) return; + trace_trap_entry(regs, pgm_int_code & 0xffff); + if (regs->psw.mask & PSW_MASK_PSTATE) { struct task_struct *tsk = current; @@ -314,11 +324,14 @@ static inline void __kprobes do_trap(long pgm_int_code, int signr, char *str, enum bug_trap_type btt; btt = report_bug(regs->psw.addr & PSW_ADDR_INSN, regs); - if (btt == BUG_TRAP_TYPE_WARN) + if (btt == BUG_TRAP_TYPE_WARN) { + trace_trap_exit(); return; + } die(str, regs, pgm_int_code); } } + trace_trap_exit(); } static inline void __user *get_psw_address(struct pt_regs *regs, @@ -422,9 +435,11 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code, location = get_psw_address(regs, pgm_int_code); + trace_trap_entry(regs, pgm_int_code & 0xffff); + if (regs->psw.mask & PSW_MASK_PSTATE) { if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) - return; + goto end; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { if (tracehook_consider_fatal_signal(current, SIGTRAP)) force_sig(SIGTRAP, current); @@ -433,24 +448,24 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code, #ifdef CONFIG_MATHEMU } else if (opcode[0] == 0xb3) { if (get_user(*((__u16 *) (opcode+2)), location+1)) - return; + goto end; signal = math_emu_b3(opcode, regs); } else if (opcode[0] == 0xed) { if (get_user(*((__u32 *) (opcode+2)), (__u32 __user *)(location+1))) - return; + goto end; signal = math_emu_ed(opcode, regs); } else if (*((__u16 *) opcode) == 0xb299) { if (get_user(*((__u16 *) (opcode+2)), location+1)) - return; + goto end; signal = math_emu_srnm(opcode, regs); } else if (*((__u16 *) opcode) == 0xb29c) { if 
(get_user(*((__u16 *) (opcode+2)), location+1)) - return; + goto end; signal = math_emu_stfpc(opcode, regs); } else if (*((__u16 *) opcode) == 0xb29d) { if (get_user(*((__u16 *) (opcode+2)), location+1)) - return; + goto end; signal = math_emu_lfpc(opcode, regs); #endif } else @@ -486,6 +501,8 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code, do_trap(pgm_int_code, signal, "illegal operation", regs, &info); } +end: + trace_trap_exit(); } @@ -500,6 +517,8 @@ asmlinkage void specification_exception(struct pt_regs *regs, location = (__u16 __user *) get_psw_address(regs, pgm_int_code); + trace_trap_entry(regs, pgm_int_code & 0xffff); + if (regs->psw.mask & PSW_MASK_PSTATE) { get_user(*((__u16 *) opcode), location); switch (opcode[0]) { @@ -544,6 +563,7 @@ asmlinkage void specification_exception(struct pt_regs *regs, do_trap(pgm_int_code, signal, "specification exception", regs, &info); } + trace_trap_exit(); } #else DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, @@ -558,6 +578,8 @@ static void data_exception(struct pt_regs *regs, long pgm_int_code, location = get_psw_address(regs, pgm_int_code); + trace_trap_entry(regs, pgm_int_code & 0xffff); + if (MACHINE_HAS_IEEE) asm volatile("stfpc %0" : "=m" (current->thread.fp_regs.fpc)); @@ -631,6 +653,7 @@ static void data_exception(struct pt_regs *regs, long pgm_int_code, info.si_addr = location; do_trap(pgm_int_code, signal, "data exception", regs, &info); } + trace_trap_exit(); } static void space_switch_exception(struct pt_regs *regs, long pgm_int_code, @@ -638,6 +661,7 @@ static void space_switch_exception(struct pt_regs *regs, long pgm_int_code, { siginfo_t info; + trace_trap_entry(regs, pgm_int_code & 0xffff); /* Set user psw back to home space mode. 
*/ if (regs->psw.mask & PSW_MASK_PSTATE) regs->psw.mask |= PSW_ASC_HOME; @@ -647,6 +671,7 @@ static void space_switch_exception(struct pt_regs *regs, long pgm_int_code, info.si_code = ILL_PRVOPC; info.si_addr = get_psw_address(regs, pgm_int_code); do_trap(pgm_int_code, SIGILL, "space switch event", regs, &info); + trace_trap_exit(); } asmlinkage void __kprobes kernel_stack_overflow(struct pt_regs * regs) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2c57806c0858..f07b4d2cb53a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -5,6 +5,7 @@ * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation * Author(s): Hartmut Penner (hp@de.ibm.com) * Ulrich Weigand (uweigand@de.ibm.com) + * Portions added by T. Halloran: (C) Copyright 2002 IBM Poughkeepsie, IBM Corporation * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds @@ -31,6 +32,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <trace/fault.h> #include <asm/asm-offsets.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -39,6 +41,11 @@ #include <asm/compat.h> #include "../kernel/entry.h" +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); +DEFINE_TRACE(page_fault_nosem_entry); +DEFINE_TRACE(page_fault_nosem_exit); + #ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 #define __SUBCODE_MASK 0x0200 @@ -272,7 +279,10 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code, /* User mode accesses just cause a SIGSEGV */ si_code = (fault == VM_FAULT_BADMAP) ? 
SEGV_MAPERR : SEGV_ACCERR; + trace_page_fault_nosem_entry(regs, int_code & 0xffff, + trans_exc_code); do_sigsegv(regs, int_code, si_code, trans_exc_code); + trace_page_fault_nosem_exit(); return; } case VM_FAULT_BADCONTEXT: diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8a9011dced14..fb4ef6a78dbb 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -48,6 +48,7 @@ config SUPERH32 select PERF_EVENTS select ARCH_HIBERNATION_POSSIBLE if MMU select SPARSE_IRQ + select HAVE_LTT_DUMP_TABLES config SUPERH64 def_bool ARCH = "sh64" @@ -211,6 +212,8 @@ config CPU_SH4 select CPU_HAS_FPU if !CPU_SH4AL_DSP select SYS_SUPPORTS_TMU select SYS_SUPPORTS_HUGETLBFS if MMU + select HAVE_TRACE_CLOCK + select HAVE_TRACE_CLOCK_32_TO_64 config CPU_SH4A bool diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index c228946926ed..2d94b81add33 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -120,6 +120,7 @@ extern void init_thread_xstate(void); #define TIF_SECCOMP 6 /* secure computing */ #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ #define TIF_SYSCALL_TRACEPOINT 8 /* for ftrace syscall instrumentation */ +#define TIF_KERNEL_TRACE 9 /* kernel trace active */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 /* Freezing for suspend */ @@ -132,6 +133,7 @@ extern void init_thread_xstate(void); #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_FREEZE (1 << TIF_FREEZE) @@ -144,17 +146,18 @@ extern void init_thread_xstate(void); /* work to do in syscall trace */ #define _TIF_WORK_SYSCALL_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_SYSCALL_AUDIT | 
_TIF_SECCOMP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE) /* work to do on any return to u-space */ #define _TIF_ALLWORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SYSCALL_AUDIT | \ _TIF_SINGLESTEP | _TIF_NOTIFY_RESUME | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ - _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)) + _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ + _TIF_KERNEL_TRACE)) /* * Thread-synchronous status. diff --git a/arch/sh/include/asm/timex.h b/arch/sh/include/asm/timex.h index 18bf06d9c764..5249bee819ca 100644 --- a/arch/sh/include/asm/timex.h +++ b/arch/sh/include/asm/timex.h @@ -12,6 +12,8 @@ * can be used for accurately setting CLOCK_TICK_RATE, otherwise we * simply fall back on the i8253 PIT value. */ + +#if 0 #ifdef CONFIG_SH_PCLK_FREQ #define CLOCK_TICK_RATE (CONFIG_SH_PCLK_FREQ / 4) /* Underlying HZ */ #else @@ -19,5 +21,18 @@ #endif #include <asm-generic/timex.h> +#endif //0 + +#include <linux/io.h> +#include <cpu/timer.h> + +#define CLOCK_TICK_RATE (HZ * 100000UL) + +typedef unsigned long long cycles_t; + +static __inline__ cycles_t get_cycles (void) +{ + return 0xffffffff - ctrl_inl(TMU1_TCNT); +} #endif /* __ASM_SH_TIMEX_H */ diff --git a/arch/sh/include/asm/trace-clock.h b/arch/sh/include/asm/trace-clock.h new file mode 100644 index 000000000000..152d54c41818 --- /dev/null +++ b/arch/sh/include/asm/trace-clock.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2007,2008 Giuseppe Cavallaro <peppe.cavallaro@st.com> + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * Trace clock definitions for SuperH. + */ + +#ifndef _ASM_SH_TRACE_CLOCK_H +#define _ASM_SH_TRACE_CLOCK_H + +#include <linux/clocksource.h> +#include <asm/clock.h> + +/* + * Number of hardware clock bits. The higher order bits are expected to be 0. 
+ * If the hardware clock source has more than 32 bits, the bits higher than the + * 32nd will be truncated by a cast to a 32 bits unsigned. Range : 1 - 32. + * (too few bits would be unrealistic though, since we depend on the timer to + * detect the overflows). + */ +#define TC_HW_BITS 32 + +/* Expected maximum interrupt latency in ms : 15ms, *2 for security */ +#define TC_EXPECTED_INTERRUPT_LATENCY 30 + +extern u64 trace_clock_read_synthetic_tsc(void); +extern u64 sh_get_clock_frequency(void); +extern u32 sh_read_timer_count(void); +extern void get_synthetic_tsc(void); +extern void put_synthetic_tsc(void); + +static inline u32 trace_clock_read32(void) +{ + return sh_read_timer_count(); +} + +static inline u64 trace_clock_read64(void) +{ + return trace_clock_read_synthetic_tsc(); +} + +static inline u64 trace_clock_frequency(void) +{ + return sh_get_clock_frequency(); +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +static inline int get_trace_clock(void) +{ + get_synthetic_tsc(); + return 0; +} + +static inline void put_trace_clock(void) +{ + put_synthetic_tsc(); +} + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_SH_TRACE_CLOCK_H */ diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 77f7ae1d4647..fcb0da93c42f 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -47,5 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += localtimer.o +obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o ccflags-y := -Werror diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 762a13984bbd..ddffe37d9686 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -21,12 +21,15 @@ #include <linux/fs.h> #include <linux/ftrace.h> #include <linux/hw_breakpoint.h> +#include <trace/sched.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include 
<asm/system.h> #include <asm/fpu.h> #include <asm/syscalls.h> +DEFINE_TRACE(sched_kthread_create); + void show_regs(struct pt_regs * regs) { printk("\n"); @@ -94,6 +97,8 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + trace_sched_kthread_create(fn, pid); + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 210c1cabcb7f..4b17fd9ed799 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -25,12 +25,15 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/io.h> +#include <trace/sched.h> #include <asm/syscalls.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> #include <asm/fpu.h> +DEFINE_TRACE(sched_kthread_create); + struct task_struct *last_task_used_math = NULL; void show_regs(struct pt_regs *regs) @@ -300,6 +303,7 @@ ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) */ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { + int pid; struct pt_regs regs; memset(®s, 0, sizeof(regs)); @@ -310,8 +314,12 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.sr = (1 << 30); /* Ok, create the new process.. 
*/ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, + pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + + trace_sched_kthread_create(fn, pid); + + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 90a15d29feeb..0373238e0d53 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -26,6 +26,10 @@ #include <linux/elf.h> #include <linux/regset.h> #include <linux/hw_breakpoint.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/marker.h> +#include <trace/syscall.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -33,10 +37,34 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> #include <asm/fpu.h> +#include <asm/unistd.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + +extern unsigned long sys_call_table[]; +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < NR_syscalls; i++) { + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i, (void *)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); + +void ltt_dump_idt_table(void *call_data) +{ +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); + /* * This routine will get a word off of the process kernel stack. 
*/ @@ -491,6 +519,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + trace_syscall_entry(regs, regs->regs[3]); + secure_computing(regs->regs[0]); if (test_thread_flag(TIF_SYSCALL_TRACE) && @@ -517,6 +547,8 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; + trace_syscall_exit(regs->regs[0]); + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), regs->regs[0]); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index 4436eacddb15..c893d20483b4 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -31,6 +31,7 @@ #include <linux/tracehook.h> #include <linux/elf.h> #include <linux/regset.h> +#include <trace/syscall.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -43,6 +44,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* This mask defines the bits of the SR which the user is not allowed to change, which are everything except S, Q, M, PR, SZ, FR. */ #define SR_MASK (0xffff8cfd) diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 8c6a350df751..b519b22b575f 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -27,6 +27,9 @@ #include <asm/unistd.h> #include <asm/cacheflush.h> #include <asm/cachectl.h> +#include <trace/ipc.h> + +DEFINE_TRACE(ipc_call); asmlinkage int old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/sh/kernel/trace-clock.c b/arch/sh/kernel/trace-clock.c new file mode 100644 index 000000000000..0c5509b96678 --- /dev/null +++ b/arch/sh/kernel/trace-clock.c @@ -0,0 +1,55 @@ +/* + * arch/sh/kernel/trace-clock.c + * + * Trace clock for SuperH. 
+ * + * Copyright (C) 2010 STMicroelectronics Ltd + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Note: currently only tested and supported on SH4 CPU + * (TODO: tests on other SuperH architectures). + */ + +#include <linux/module.h> +#include <linux/clocksource.h> +#include <asm/clock.h> + +static struct clocksource *clksrc; + +/* In case of the TMU, for SH4 architectures, it returns + * the value of timer counter register (TCNT). */ +u32 sh_read_timer_count(void) +{ + u32 value = 0; + + if (likely(clksrc)) + value = (u32) clksrc->read(clksrc); + + return value; +} + +/* Get the clock rate for the timer (e.g. TMU for SH4) */ +u64 sh_get_clock_frequency(void) +{ + u64 rate = 0; + struct clk *clk; + + clk = clk_get(NULL, "module_clk"); + if (likely(clk)) + rate = clk_get_rate(clk) / 4; + + return rate; +} + +/* Get the clock source needed to read the timer counter. + * For example a TMU channel for SH4 architectures. */ +static __init int init_sh_clocksource(void) +{ + clksrc = clocksource_get_next(); + if (unlikely(!clksrc)) + pr_err("%s: no clocksource found\n", __func__); + + return 0; +} +early_initcall(init_sh_clocksource); diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 3484c2f65aba..5abd87752eb6 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -27,6 +27,7 @@ #include <linux/sysfs.h> #include <linux/uaccess.h> #include <linux/perf_event.h> +#include <trace/trap.h> #include <asm/system.h> #include <asm/alignment.h> #include <asm/fpu.h> @@ -47,6 +48,9 @@ #define TRAP_ILLEGAL_SLOT_INST 13 #endif +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + static void dump_mem(const char *str, unsigned long bottom, unsigned long top) { unsigned long p; @@ -545,6 +549,8 @@ asmlinkage void do_address_error(struct pt_regs *regs, error_code = lookup_exception_vector(); #endif + trace_trap_entry(regs, error_code >> 5); + oldfs = get_fs(); if (user_mode(regs)) { @@ -589,8 +595,10 @@ fixup: address); 
set_fs(oldfs); - if (tmp == 0) + if (!tmp) { + trace_trap_exit(); return; /* sorted */ + } uspace_segv: printk(KERN_NOTICE "Sending SIGBUS to \"%s\" due to unaligned " "access (PC %lx PR %lx)\n", current->comm, regs->pc, @@ -623,6 +631,7 @@ uspace_segv: 0, address); set_fs(oldfs); } + trace_trap_exit(); } #ifdef CONFIG_SH_DSP diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index d4c34d757f0d..226957634033 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -16,11 +16,17 @@ #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/perf_event.h> +#include <trace/fault.h> #include <asm/io_trapped.h> #include <asm/system.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); +DEFINE_TRACE(page_fault_nosem_entry); +DEFINE_TRACE(page_fault_nosem_exit); + static inline int notify_page_fault(struct pt_regs *regs, int trap) { int ret = 0; @@ -200,7 +206,14 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ + trace_page_fault_entry(regs, + ({ + unsigned long trapnr; + asm volatile("stc r2_bank,%0": "=r" (trapnr)); + trapnr; + }) >> 5, mm, vma, address, writeaccess); fault = handle_mm_fault(mm, vma, address, writeaccess ? 
FAULT_FLAG_WRITE : 0); + trace_page_fault_exit(fault); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -230,11 +243,18 @@ bad_area: bad_area_nosemaphore: if (user_mode(regs)) { + trace_page_fault_nosem_entry(regs, + ({ + unsigned long trapnr; + asm volatile("stc r2_bank,%0": "=r" (trapnr)); + trapnr; + }) >> 5, address); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void *) address; force_sig_info(SIGSEGV, &info, tsk); + trace_page_fault_nosem_exit(); return; } @@ -324,6 +344,11 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long writeaccess, pmd_t *pmd; pte_t *pte; pte_t entry; + int ret; + int irqvec; + + irqvec = lookup_exception_vector(); + trace_page_fault_nosem_entry(regs, irqvec, address); /* * We don't take page faults for P1, P2, and parts of P4, these @@ -333,24 +358,34 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long writeaccess, if (address >= P3SEG && address < P3_ADDR_MAX) { pgd = pgd_offset_k(address); } else { - if (unlikely(address >= TASK_SIZE || !current->mm)) - return 1; + if (unlikely(address >= TASK_SIZE || !current->mm)) { + ret = 1; + goto out; + } pgd = pgd_offset(current->mm, address); } pud = pud_offset(pgd, address); - if (pud_none_or_clear_bad(pud)) - return 1; + if (pud_none_or_clear_bad(pud)) { + ret = 1; + goto out; + } pmd = pmd_offset(pud, address); - if (pmd_none_or_clear_bad(pmd)) - return 1; + if (pmd_none_or_clear_bad(pmd)) { + ret = 1; + goto out; + } pte = pte_offset_kernel(pmd, address); entry = *pte; - if (unlikely(pte_none(entry) || pte_not_present(entry))) - return 1; - if (unlikely(writeaccess && !pte_write(entry))) - return 1; + if (unlikely(pte_none(entry) || pte_not_present(entry))) { + ret = 1; + goto out; + } + if (unlikely(writeaccess && !pte_write(entry))) { + ret = 1; + goto out; + } if (writeaccess) entry = pte_mkdirty(entry); @@ -370,5 +405,8 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long writeaccess, 
update_mmu_cache(NULL, address, pte); - return 0; + ret = 0; +out: + trace_page_fault_nosem_exit(); + return ret; } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 95695e97703e..76f95707b976 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -40,6 +40,7 @@ config SPARC64 select HAVE_KPROBES select HAVE_MEMBLOCK select HAVE_SYSCALL_WRAPPERS + select HAVE_GET_CYCLES select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS @@ -50,6 +51,7 @@ config SPARC64 select RTC_DRV_STARFIRE select HAVE_PERF_EVENTS select PERF_USE_VMALLOC + select HAVE_TRACE_CLOCK select HAVE_GENERIC_HARDIRQS config ARCH_DEFCONFIG diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 9dd0318d3ddf..654899022445 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -128,6 +128,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ +#define TIF_KERNEL_TRACE 5 /* kernel trace active */ #define TIF_USEDFPU 8 /* FPU was used by this task * this quantum (SMP) */ #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling @@ -137,6 +138,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index fb2ea7705a46..9de58956acec 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -214,7 +214,7 @@ register struct thread_info *current_thread_info_reg 
asm("g6"); #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ /* flag bit 6 is available */ #define TIF_32BIT 7 /* 32-bit binary */ -/* flag bit 8 is available */ +#define TIF_KERNEL_TRACE 8 /* kernel trace active */ #define TIF_SECCOMP 9 /* secure computing */ #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ @@ -233,6 +233,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_UNALIGNED (1<<TIF_UNALIGNED) #define _TIF_32BIT (1<<TIF_32BIT) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) diff --git a/arch/sparc/include/asm/timex_64.h b/arch/sparc/include/asm/timex_64.h index 18b30bc9823b..905443a88894 100644 --- a/arch/sparc/include/asm/timex_64.h +++ b/arch/sparc/include/asm/timex_64.h @@ -12,7 +12,24 @@ /* Getting on the cycle counter on sparc64. */ typedef unsigned long cycles_t; -#define get_cycles() tick_ops->get_tick() + +static inline cycles_t get_cycles(void) +{ + return tick_ops->get_tick(); +} + +/* get_cycles instruction is synchronized on sparc64 */ +static inline void get_cycles_barrier(void) +{ + return; +} + +extern unsigned long tb_ticks_per_usec; + +static inline cycles_t get_cycles_rate(void) +{ + return (cycles_t)tb_ticks_per_usec * 1000000UL; +} #define ARCH_HAS_READ_CURRENT_TIMER diff --git a/arch/sparc/include/asm/trace-clock.h b/arch/sparc/include/asm/trace-clock.h new file mode 100644 index 000000000000..306fdf7b7ba0 --- /dev/null +++ b/arch/sparc/include/asm/trace-clock.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2008, Mathieu Desnoyers + * + * Trace clock definitions for Sparc64. 
+ */ + +#ifndef _ASM_SPARC_TRACE_CLOCK_H +#define _ASM_SPARC_TRACE_CLOCK_H + +#include <linux/timex.h> + +static inline u32 trace_clock_read32(void) +{ + return get_cycles(); +} + +static inline u64 trace_clock_read64(void) +{ + return get_cycles(); +} + +static inline unsigned int trace_clock_frequency(void) +{ + return get_cycles_rate(); +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +static inline int get_trace_clock(void) +{ + return 0; +} + +static inline void put_trace_clock(void) +{ +} + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_SPARC_TRACE_CLOCK_H */ diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S index 1504df8ddf70..54210d48dbad 100644 --- a/arch/sparc/kernel/entry.S +++ b/arch/sparc/kernel/entry.S @@ -1151,7 +1151,7 @@ sys_sigreturn: add %sp, STACKFRAME_SZ, %o0 ld [%curptr + TI_FLAGS], %l5 - andcc %l5, _TIF_SYSCALL_TRACE, %g0 + andcc %l5, (_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE), %g0 be 1f nop @@ -1171,7 +1171,7 @@ sys_rt_sigreturn: add %sp, STACKFRAME_SZ, %o0 ld [%curptr + TI_FLAGS], %l5 - andcc %l5, _TIF_SYSCALL_TRACE, %g0 + andcc %l5, (_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE), %g0 be 1f nop @@ -1313,7 +1313,7 @@ syscall_is_too_hard: ld [%curptr + TI_FLAGS], %l5 mov %i3, %o3 - andcc %l5, _TIF_SYSCALL_TRACE, %g0 + andcc %l5, (_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE), %g0 mov %i4, %o4 bne linux_syscall_trace mov %i0, %l5 @@ -1330,7 +1330,7 @@ ret_sys_call: ld [%sp + STACKFRAME_SZ + PT_PSR], %g3 set PSR_C, %g2 bgeu 1f - andcc %l6, _TIF_SYSCALL_TRACE, %g0 + andcc %l6, (_TIF_SYSCALL_TRACE|_TIF_KERNEL_TRACE), %g0 /* System call success, clear Carry condition code. 
*/ andn %g3, %g2, %g3 diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 17529298c50a..57c0d67b5c4c 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -24,6 +24,7 @@ #include <linux/pm.h> #include <linux/init.h> #include <linux/slab.h> +#include <trace/sched.h> #include <asm/auxio.h> #include <asm/oplib.h> @@ -39,6 +40,8 @@ #include <asm/prom.h> #include <asm/unistd.h> +DEFINE_TRACE(sched_kthread_create); + /* * Power management idle function * Set in pm platform drivers (apc.c and pmc.c) @@ -674,6 +677,7 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED), "i" (__NR_exit), "r" (fn), "r" (arg) : "g1", "g2", "g3", "o0", "o1", "memory", "cc"); + trace_sched_kthread_create(fn, retval); return retval; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 3bc9c9979b92..d57935a02459 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -782,7 +782,8 @@ static struct clocksource clocksource_tick = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static unsigned long tb_ticks_per_usec __read_mostly; +unsigned long tb_ticks_per_usec __read_mostly; +EXPORT_SYMBOL_GPL(tb_ticks_per_usec); void __delay(unsigned long loops) { diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index e2cf786bda0a..510bf5ba9641 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -68,6 +68,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 6 #define TIF_RESTORE_SIGMASK 7 +#define TIF_KERNEL_TRACE 8 /* kernel trace active */ #define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) @@ -77,6 +78,7 @@ static inline struct thread_info *current_thread_info(void) #define 
_TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_FREEZE (1 << TIF_FREEZE) #endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aad..b0389519b6de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86 select HAVE_READQ select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_GET_CYCLES select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS @@ -27,9 +28,11 @@ config X86 select HAVE_KPROBES select HAVE_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB + select HAVE_LTT_DUMP_TABLES select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_TRACE_CLOCK select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_C_RECORDMCOUNT @@ -208,10 +211,12 @@ config HAVE_INTEL_TXT config X86_32_SMP def_bool y depends on X86_32 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_64_SMP def_bool y depends on X86_64 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_HT def_bool y diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 29cdcd02ead3..accd6b42bd2c 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -8,8 +8,11 @@ #include <linux/shm.h> #include <linux/ipc.h> #include <linux/compat.h> +#include <trace/ipc.h> #include <asm/sys_ia32.h> +DEFINE_TRACE(ipc_call); + asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) { @@ -18,6 +21,8 @@ asmlinkage long sys32_ipc(u32 call, int first, int second, int third, version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 38d87379e270..9b1db108f9ec 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ 
-1,20 +1,9 @@ #ifndef _ASM_X86_IDLE_H #define _ASM_X86_IDLE_H -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -#endif /* CONFIG_X86_64 */ +extern void enter_idle(void); +extern void __exit_idle(void); +extern void exit_idle(void); void c1e_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8bf108..fdf897373e19 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -56,6 +56,61 @@ static inline void native_halt(void) #endif +#ifdef CONFIG_X86_64 +/* + * Only returns from a trap or exception to a NMI context (intra-privilege + * level near return) to the same SS and CS segments. Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. It takes care of modifying the eflags, rsp and returning to the + * previous function. + * + * The stack, at that point, looks like : + * + * 0(rsp) RIP + * 8(rsp) CS + * 16(rsp) EFLAGS + * 24(rsp) RSP + * 32(rsp) SS + * + * Upon execution : + * Copy EIP to the top of the return stack + * Update top of return stack address + * Pop eflags into the eflags register + * Make the return stack current + * Near return (popping the return address from the return stack) + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushq %rax; \ + movq %rsp, %rax; \ + movq 24+8(%rax), %rsp; \ + pushq 0+8(%rax); \ + pushq 16+8(%rax); \ + movq (%rax), %rax; \ + popfq; \ + ret +#else +/* + * Protected mode only, no V8086. Implies that protected mode must + * be entered before NMIs or MCEs are enabled. Only returns from a trap or + * exception to a NMI context (intra-privilege level far return). 
Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. + * + * The stack, at that point, looks like : + * + * 0(esp) EIP + * 4(esp) CS + * 8(esp) EFLAGS + * + * Upon execution : + * Copy the stack eflags to top of stack + * Pop eflags into the eflags register + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS. + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \ + popfl; \ + lret $4 +#endif + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -112,6 +167,7 @@ static inline unsigned long arch_local_irq_save(void) #define ENABLE_INTERRUPTS(x) sti #define DISABLE_INTERRUPTS(x) cli +#define INTERRUPT_RETURN_NMI_SAFE NATIVE_INTERRUPT_RETURN_NMI_SAFE #ifdef CONFIG_X86_64 #define SWAPGS swapgs diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/include/asm/kvm-mmutrace.h index b60b4fdb3eda..42d117d0418f 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/include/asm/kvm-mmutrace.h @@ -217,9 +217,9 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace +#define TRACE_INCLUDE_FILE kvm-mmutrace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/trace.h b/arch/x86/include/asm/kvm-trace.h index 1357d7cf4ec8..c1e151c092b2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/include/asm/kvm-trace.h @@ -701,9 +701,9 @@ TRACE_EVENT(kvm_emulate_insn, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace +#define TRACE_INCLUDE_FILE kvm-trace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8ab170..1ef6906c179e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -962,6 +962,10 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) +#define INTERRUPT_RETURN_NMI_SAFE \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE, \ + jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return) + #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c869..3e0634cc1270 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -181,6 +181,7 @@ struct pv_cpu_ops { /* Normal iret. Jump to this with the standard iret stack frame set up. 
*/ void (*iret)(void); + void (*nmi_return)(void); void (*swapgs)(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a0..58a37ae7565b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_KERNEL_TRACE 9 /* kernel trace active */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -105,6 +106,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -121,18 +123,19 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_KERNEL_TRACE) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & \ ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) + _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU|_TIF_KERNEL_TRACE)) /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ diff --git 
a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h new file mode 100644 index 000000000000..8ca73323366c --- /dev/null +++ b/arch/x86/include/asm/trace-clock.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +/* + * linux/arch/x86/include/asm/trace-clock.h + * + * Copyright (C) 2005,2006,2008 + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Trace clock definitions for x86. + */ + +#include <linux/timex.h> +#include <linux/time.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +/* Minimum duration of a probe, in cycles */ +#define TRACE_CLOCK_MIN_PROBE_DURATION 200 +#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION + +union lttng_timespec { + struct timespec ts; + u64 lttng_ts; +}; + +extern cycles_t trace_clock_async_tsc_read(void); + +extern int _trace_clock_is_sync; +static inline int trace_clock_is_sync(void) +{ + return _trace_clock_is_sync; +} + +static inline u32 trace_clock_read32(void) +{ + u32 cycles; + + if (likely(trace_clock_is_sync())) + cycles = (u32)get_cycles(); /* only need the 32 LSB */ + else + cycles = (u32)trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_read64(void) +{ + u64 cycles; + + if (likely(trace_clock_is_sync())) + cycles = get_cycles(); + else + cycles = trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_frequency(void) +{ + return (u64)cpu_khz * 1000; +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern int get_trace_clock(void); +extern void put_trace_clock(void); + +extern void set_trace_clock_is_sync(int state); + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ca132fc0d03..28e56e1ec3c0 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,18 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned 
long native_calibrate_tsc(void); +static inline cycles_t get_cycles_rate(void) +{ + if (check_tsc_unstable()) + return 0; + return (cycles_t)tsc_khz * 1000; +} + +static inline void get_cycles_barrier(void) +{ + rdtsc_barrier(); +} + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: @@ -62,4 +74,10 @@ extern int notsc_setup(char *); extern void save_sched_clock_state(void); extern void restore_sched_clock_state(void); +extern int test_tsc_synchronization(void); +extern int _tsc_is_sync; +static inline int tsc_is_sync(void) +{ + return _tsc_is_sync; +} #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e204826f..06abe8f409a6 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -12,6 +12,7 @@ struct vsyscall_gtod_data { u32 wall_time_nsec; int sysctl_enabled; + int trace_clock_is_sync; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d0983d255fbd..47b80f3ba4d6 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -39,6 +39,14 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +#ifdef CONFIG_X86_64 +extern void update_trace_clock_is_sync_vdso(void); +#else +static inline void update_trace_clock_is_sync_vdso(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd880..717cf9c620bf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,6 +46,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -66,9 +67,8 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := 
apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978a..c604d23b4f34 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -33,6 +33,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <trace/irq.h> #include <asm/perf_event.h> #include <asm/x86_init.h> @@ -868,7 +869,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) */ exit_idle(); irq_enter(); + trace_irq_entry(LOCAL_TIMER_VECTOR, regs, NULL); local_apic_timer_interrupt(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); set_irq_regs(old_regs); @@ -1788,6 +1791,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(SPURIOUS_APIC_VECTOR, NULL, NULL); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1802,6 +1806,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } @@ -1814,6 +1819,7 @@ void smp_error_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(ERROR_APIC_VECTOR, NULL, NULL); /* First tickle the hardware, only then report what went on. 
-- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1834,6 +1840,7 @@ void smp_error_interrupt(struct pt_regs *regs) */ pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c2a746..60939d5f226c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/idle.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,6 +236,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/idle.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -947,10 +949,17 @@ recalc: break; } } + enter_idle(); if (original_pm_idle) original_pm_idle(); else default_idle(); + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37a..677f8475d9de 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -111,6 +111,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd965..1aea11cd8404 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -58,6 +58,7 @@ int main(void) OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396bd..6052f6f65a6b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1069,6 +1069,7 @@ unsigned long kernel_eflags; * debugging, no special alignment required. 
*/ DEFINE_PER_CPU(struct orig_ist, orig_ist); +EXPORT_PER_CPU_SYMBOL_GPL(orig_ist); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97f..c8a6411d8baa 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/processor.h> #include <asm/system.h> @@ -402,8 +403,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { exit_idle(); irq_enter(); + trace_irq_entry(THERMAL_APIC_VECTOR, regs, NULL); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index df20723a6a1b..6bed23e1c748 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,6 +15,7 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> +#include <linux/ltt-core.h> #include <asm/stacktrace.h> @@ -253,6 +254,8 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!signr) return; + if (in_nmi()) + panic("Fatal exception in non-maskable interrupt"); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) @@ -277,6 +280,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); +#ifdef CONFIG_LTT + printk(KERN_EMERG "LTT NESTING LEVEL : %u", __get_cpu_var(ltt_nesting)); + printk("\n"); +#endif sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c8b4efad7ebb..2fae6c570fd4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -80,6 +80,8 @@ #define 
nr_syscalls ((syscall_table_size)/4) +#define NMI_MASK 0x04000000 + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -321,8 +323,32 @@ END(ret_from_fork) # userspace resumption stub bypassing syscall exit tracing ALIGN RING0_PTREGS_FRAME + ret_from_exception: preempt_stop(CLBR_ANY) + GET_THREAD_INFO(%ebp) + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jae resume_userspace # returning to v8086 or userspace + testl $NMI_MASK,TI_preempt_count(%ebp) + jz resume_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz resume_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + TRACE_IRQS_IRET + RESTORE_REGS + addl $4, %esp # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 + INTERRUPT_RETURN_NMI_SAFE + ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -906,6 +932,10 @@ ENTRY(native_iret) .previous END(native_iret) +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ? 
+END(native_nmi_return) + ENTRY(native_irq_enable_sysexit) sti sysexit diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index aed1ffbeb0c9..c841c0fb5cce 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -163,6 +163,8 @@ GLOBAL(return_to_handler) #endif +#define NMI_MASK 0x04000000 + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -515,6 +517,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $_TIF_KERNEL_TRACE,%edx /* Re-read : concurrently changed */ + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -524,6 +528,16 @@ sysret_careful: popq_cfi %rdi jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON @@ -872,6 +886,9 @@ ENTRY(native_iret) .section __ex_table,"a" .quad native_iret, bad_iret .previous + +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE #endif .section .fixup,"ax" @@ -924,6 +941,24 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule + /* Returning to kernel space from exception. */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retexc_kernel) + testl $NMI_MASK,TI_preempt_count(%rcx) + jz retint_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */ + jnz retint_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + RESTORE_ARGS 0,8,0 + TRACE_IRQS_IRETQ + INTERRUPT_RETURN_NMI_SAFE + #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. 
*/ @@ -1361,12 +1396,18 @@ ENTRY(paranoid_exit) paranoid_swapgs: TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK +paranoid_restore_no_nmi: RESTORE_ALL 8 jmp irq_return paranoid_restore: + GET_THREAD_INFO(%rcx) TRACE_IRQS_IRETQ 0 + testl $NMI_MASK,TI_preempt_count(%rcx) + jz paranoid_restore_no_nmi /* Nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-0(%rsp) /* trap flag? */ + jnz paranoid_restore_no_nmi RESTORE_ALL 8 - jmp irq_return + INTERRUPT_RETURN_NMI_SAFE paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx @@ -1465,7 +1506,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jne retexc_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71b..1fc5da98373c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -156,6 +156,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) @@ -204,6 +205,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); +extern void native_nmi_return(void); extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -373,6 +375,7 @@ struct pv_cpu_ops pv_cpu_ops = { .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, + .nmi_return = native_nmi_return, .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..ac372778bbc0 100644 --- a/arch/x86/kernel/paravirt_patch_32.c 
+++ b/arch/x86/kernel/paravirt_patch_32.c @@ -1,10 +1,13 @@ -#include <asm/paravirt.h> +#include <linux/stringify.h> +#include <linux/irqflags.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); @@ -41,6 +44,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34f93eb..5339e67dc153 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -1,12 +1,15 @@ +#include <linux/irqflags.h> +#include <linux/stringify.h> #include <asm/paravirt.h> #include <asm/asm-offsets.h> -#include <linux/stringify.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_cpu_ops, iret, "iretq"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -51,6 +54,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); + 
PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff4554198981..e0e4ffcad481 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -13,6 +13,7 @@ #include <linux/dmi.h> #include <linux/utsname.h> #include <trace/events/power.h> +#include <trace/sched.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/system.h> @@ -23,6 +24,8 @@ #include <asm/i387.h> #include <asm/debugreg.h> +DEFINE_TRACE(sched_kthread_create); + struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -278,6 +281,7 @@ extern void kernel_thread_helper(void); int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; + long pid; memset(®s, 0, sizeof(regs)); @@ -299,7 +303,10 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. 
*/ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, + 0, ®s, 0, NULL, NULL); + trace_sched_kthread_create(fn, pid); + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..3a8c9ee0fc6e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,9 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/notifier.h> +#include <linux/idle.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -59,6 +62,38 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + +static DEFINE_PER_CPU(unsigned char, is_idle); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + trace_pm_idle_entry(); + notify_idle(IDLE_START); +} +EXPORT_SYMBOL_GPL(enter_idle); + +void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + notify_idle(IDLE_END); + trace_pm_idle_exit(); +} +EXPORT_SYMBOL_GPL(__exit_idle); + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +EXPORT_SYMBOL_GPL(exit_idle); + /* * Return saved PC of a blocked thread. */ @@ -107,10 +142,18 @@ void cpu_idle(void) play_dead(); local_irq_disable(); + enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); start_critical_timings(); + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..b21b379013e9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -35,8 +35,10 @@ #include <linux/tick.h> #include <linux/prctl.h> #include <linux/uaccess.h> +#include <linux/idle.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -51,37 +53,36 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> +#include <trace/events/power.h> + +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - void enter_idle(void) { percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); + /* + * Trace last event before calling notifiers. Notifiers flush + * data from buffers before going to idle. 
+ */ + trace_pm_idle_entry(); + notify_idle(IDLE_START); } +EXPORT_SYMBOL_GPL(enter_idle); -static void __exit_idle(void) +void __exit_idle(void) { if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); + notify_idle(IDLE_END); + trace_pm_idle_exit(); } +EXPORT_SYMBOL_GPL(__exit_idle); /* Called from interrupts to signify idle end */ void exit_idle(void) @@ -91,6 +92,7 @@ void exit_idle(void) return; __exit_idle(); } +EXPORT_SYMBOL_GPL(exit_idle); #ifndef CONFIG_SMP static inline void play_dead(void) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc4b72a..ee3024d4f61e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> +#include <trace/syscall.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -152,6 +153,9 @@ static const int arg_offs_table[] = { X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* * Determines whether a value may be installed in a segment register. */ @@ -1361,6 +1365,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) if (test_thread_flag(TIF_SINGLESTEP)) regs->flags |= X86_EFLAGS_TF; + trace_syscall_entry(regs, regs->orig_ax); + /* do the secure computing check first */ secure_computing(regs->orig_ax); @@ -1396,6 +1402,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) { bool step; + trace_syscall_exit(regs->ax); + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d6008295..5e74f6aa3c0a 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -1,8 +1,11 @@ /* System call table for x86-64. 
*/ #include <linux/linkage.h> +#include <linux/module.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> #include <asm/asm-offsets.h> #define __NO_STUBS @@ -27,3 +30,18 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/unistd_64.h> }; + +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < __NR_syscall_max + 1; i++) { + sprint_symbol(namebuf, (unsigned long)sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, + call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c new file mode 100644 index 000000000000..47539e28276a --- /dev/null +++ b/arch/x86/kernel/trace-clock.c @@ -0,0 +1,302 @@ +/* + * arch/x86/kernel/trace-clock.c + * + * Trace clock for x86. + * + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>, October 2008 + */ + +#include <linux/module.h> +#include <linux/trace-clock.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/cpu.h> +#include <linux/posix-timers.h> +#include <asm/vgtod.h> + +static cycles_t trace_clock_last_tsc; +static DEFINE_PER_CPU(struct timer_list, update_timer); +static DEFINE_SPINLOCK(async_tsc_lock); +static int async_tsc_refcount; /* Number of readers */ +static int async_tsc_enabled; /* Async TSC enabled on all online CPUs */ + +int _trace_clock_is_sync = 1; +EXPORT_SYMBOL_GPL(_trace_clock_is_sync); + +/* + * Is the trace clock being used by user-space ? We leave the trace clock active + * as soon as user-space starts using it. We never unref the trace clock + * reference taken by user-space. + */ +static atomic_t user_trace_clock_ref; + +/* + * Called by check_tsc_sync_source from CPU hotplug. 
+ */ +void set_trace_clock_is_sync(int state) +{ + _trace_clock_is_sync = state; + update_trace_clock_is_sync_vdso(); +} + +#if BITS_PER_LONG == 64 +static cycles_t read_last_tsc(void) +{ + return trace_clock_last_tsc; +} +#else +/* + * A cmpxchg64 update can happen concurrently. Based on the assumption that + * two cmpxchg64 will never update it to the same value (the count always + * increases), reading it twice ensures that we read a coherent value with the + * same "sequence number". + */ +static cycles_t read_last_tsc(void) +{ + cycles_t val1, val2; + + val1 = trace_clock_last_tsc; + for (;;) { + val2 = val1; + barrier(); + val1 = trace_clock_last_tsc; + if (likely(val1 == val2)) + break; + } + return val1; +} +#endif + +/* + * Support for architectures with non-sync TSCs. + * When the local TSC is discovered to lag behind the highest TSC counter, we + * increment the TSC count by an amount that should be, ideally, lower than the + * execution time of this routine, in cycles : this is the granularity we look + * for : we must be able to order the events. + */ +notrace cycles_t trace_clock_async_tsc_read(void) +{ + cycles_t new_tsc, last_tsc; + + WARN_ON(!async_tsc_refcount || !async_tsc_enabled); + new_tsc = get_cycles(); + last_tsc = read_last_tsc(); + do { + if (new_tsc < last_tsc) + new_tsc = last_tsc + TRACE_CLOCK_MIN_PROBE_DURATION; + /* + * If cmpxchg fails with a value higher than the new_tsc, don't + * retry : the value has been incremented and the events + * happened almost at the same time. + * We must retry if cmpxchg fails with a lower value : + * it means that we are the CPU with highest frequency and + * therefore MUST update the value. 
+ */ + last_tsc = cmpxchg64(&trace_clock_last_tsc, last_tsc, new_tsc); + } while (unlikely(last_tsc < new_tsc)); + return new_tsc; +} +EXPORT_SYMBOL_GPL(trace_clock_async_tsc_read); + +static void update_timer_ipi(void *info) +{ + (void)trace_clock_async_tsc_read(); +} + +/* + * update_timer_fct : - Timer function to resync the clocks + * @data: unused + * + * Fires every jiffy. + */ +static void update_timer_fct(unsigned long data) +{ + (void)trace_clock_async_tsc_read(); + mod_timer_pinned(&per_cpu(update_timer, smp_processor_id()), + jiffies + 1); +} + +static void enable_trace_clock(int cpu) +{ + init_timer(&per_cpu(update_timer, cpu)); + per_cpu(update_timer, cpu).function = update_timer_fct; + per_cpu(update_timer, cpu).expires = jiffies + 1; + smp_call_function_single(cpu, update_timer_ipi, NULL, 1); + add_timer_on(&per_cpu(update_timer, cpu), cpu); +} + +static void disable_trace_clock(int cpu) +{ + del_timer_sync(&per_cpu(update_timer, cpu)); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + int cpu; + + spin_lock(&async_tsc_lock); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* + * trace_clock_is_sync() is updated by set_trace_clock_is_sync() + * code, protected by cpu hotplug disable. + * It is ok to let the hotplugged CPU read the timebase before + * the CPU_ONLINE notification. It's just there to give a + * maximum bound to the TSC error. 
+ */ + if (async_tsc_refcount && !trace_clock_is_sync()) { + if (!async_tsc_enabled) { + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); + } else { + enable_trace_clock(hotcpu); + } + } + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* + * We cannot stop the trace clock on other CPUs when readers are + * active even if we go back to a synchronized state (1 CPU) + * because the CPU left could be the one lagging behind. + */ + if (async_tsc_refcount && async_tsc_enabled) + disable_trace_clock(hotcpu); + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + spin_unlock(&async_tsc_lock); + + return NOTIFY_OK; +} + +int get_trace_clock(void) +{ + int cpu; + + if (!trace_clock_is_sync()) { + printk(KERN_WARNING + "Trace clock falls back on cache-line bouncing\n" + "workaround due to non-synchronized TSCs.\n" + "This workaround preserves event order across CPUs.\n" + "Please consider disabling Speedstep or PowerNow and\n" + "using kernel parameters " + "\"force_tsc_sync=1 idle=poll\"\n" + "for accurate and fast tracing clock source.\n"); + } + + get_online_cpus(); + spin_lock(&async_tsc_lock); + if (async_tsc_refcount++ || trace_clock_is_sync()) + goto end; + + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); +end: + spin_unlock(&async_tsc_lock); + put_online_cpus(); + return 0; +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + int cpu; + + get_online_cpus(); + spin_lock(&async_tsc_lock); + WARN_ON(async_tsc_refcount <= 0); + if (async_tsc_refcount != 1 || !async_tsc_enabled) + goto end; + + for_each_online_cpu(cpu) + disable_trace_clock(cpu); + async_tsc_enabled = 0; +end: + async_tsc_refcount--; + if (!async_tsc_refcount && 
num_online_cpus() == 1) + set_trace_clock_is_sync(1); + spin_unlock(&async_tsc_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(put_trace_clock); + +static int posix_get_trace(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + int ret; + + /* + * Yes, there is a race here that would lead to refcount being + * incremented more than once, but all we care is to leave the trace + * clock active forever, so precise accounting is not needed. + */ + if (unlikely(!atomic_read(&user_trace_clock_ref))) { + ret = get_trace_clock(); + if (ret) + return ret; + atomic_inc(&user_trace_clock_ref); + } + lts->lttng_ts = trace_clock_read64(); + return 0; +} + +static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = trace_clock_frequency(); + return 0; +} + +static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = TRACE_CLOCK_RES; + return 0; +} + +static __init int init_unsync_trace_clock(void) +{ + struct k_clock clock_trace = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace, + }; + struct k_clock clock_trace_freq = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace_freq, + }; + + register_posix_clock(CLOCK_TRACE, &clock_trace); + register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq); + + hotcpu_notifier(hotcpu_callback, 4); + return 0; +} +early_initcall(init_unsync_trace_clock); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9de..bc618945fb1f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <trace/trap.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -52,6 +53,7 @@ #include <asm/atomic.h> #include <asm/system.h> #include 
<asm/traps.h> +#include <asm/unistd.h> #include <asm/desc.h> #include <asm/i387.h> #include <asm/mce.h> @@ -76,11 +78,21 @@ char ignore_fpu_irq; * F0 0F bug workaround. */ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; + +extern unsigned long sys_call_table[]; +extern unsigned long syscall_table_size; + #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); +/* + * Also used in arch/x86/mm/fault.c. + */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + static int ignore_nmis; int unknown_nmi_panic; @@ -122,6 +134,8 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; + trace_trap_entry(regs, trapnr); + #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* @@ -168,7 +182,7 @@ trap_signal: force_sig_info(signr, info, tsk); else force_sig(signr, tsk); - return; + goto end; kernel_trap: if (!fixup_exception(regs)) { @@ -176,15 +190,17 @@ kernel_trap: tsk->thread.trap_no = trapnr; die(str, regs, error_code); } - return; + goto end; #ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; - return; + goto end; #endif +end: + trace_trap_exit(); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -285,7 +301,9 @@ do_general_protection(struct pt_regs *regs, long error_code) printk("\n"); } + trace_trap_entry(regs, 13); force_sig(SIGSEGV, tsk); + trace_trap_exit(); return; #ifdef CONFIG_X86_32 @@ -398,13 +416,15 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + trace_trap_entry(regs, 2); + /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific * NMI can not be detected/processed on other CPUs. 
*/ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; + goto end; /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); @@ -423,11 +443,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); #endif raw_spin_unlock(&nmi_reason_lock); - return; + goto end; } raw_spin_unlock(&nmi_reason_lock); unknown_nmi_error(reason, regs); +end: + trace_trap_exit(); } dotraplinkage notrace __kprobes void @@ -570,8 +592,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { + trace_trap_entry(regs, 1); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + trace_trap_exit(); preempt_conditional_cli(regs); return; } @@ -589,13 +613,32 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) regs->flags &= ~X86_EFLAGS_TF; } si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) { + trace_trap_entry(regs, 1); send_sigtrap(tsk, regs, error_code, si_code); + trace_trap_exit(); + } preempt_conditional_cli(regs); return; } +#ifdef CONFIG_X86_32 +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < NR_syscalls; i++) { + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -701,11 +744,13 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + 
trace_trap_entry(regs, 16); conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif + trace_trap_exit(); } asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) @@ -738,6 +783,21 @@ void __math_state_restore(void) tsk->fpu_counter++; } +void ltt_dump_idt_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < IDT_ENTRIES; i++) { + unsigned long address = gate_offset(idt_table[i]); + sprint_symbol(namebuf, address); + __trace_mark(0, irq_state, idt_table, call_data, + "irq %d address %p symbol %s", + i, (void *)address, namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c deleted file mode 100644 index 0aa5fed8b9e6..000000000000 --- a/arch/x86/kernel/tsc_sync.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * check TSC synchronization. - * - * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar - * - * We check whether all boot CPUs have their TSC's synchronized, - * print a warning if not and turn off the TSC clock-source. - * - * The warp-check is point-to-point between two CPUs, the CPU - * initiating the bootup is the 'source CPU', the freshly booting - * CPU is the 'target CPU'. - * - * Only two CPUs may participate - they can enter in any order. - * ( The serial nature of the boot logic and the CPU hotplug lock - * protects against more than 2 CPUs entering this code. 
) - */ -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/nmi.h> -#include <asm/tsc.h> - -/* - * Entry/exit counters that make sure that both CPUs - * run the measurement code at once: - */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; - -/* - * We use a raw spinlock in this exceptional case, because - * we want to have the fastest, inlined, non-debug version - * of a critical section, to be able to prove TSC time-warps: - */ -static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; - -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; - -/* - * TSC-warp measurement loop running on both CPUs: - */ -static __cpuinit void check_tsc_warp(void) -{ - cycles_t start, now, prev, end; - int i; - - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); - /* - * The measurement runs for 20 msecs: - */ - end = start + tsc_khz * 20ULL; - now = start; - - for (i = 0; ; i++) { - /* - * We take the global lock, measure TSC, save the - * previous TSC that was measured (possibly on - * another CPU) and update the previous TSC timestamp. 
- */ - arch_spin_lock(&sync_lock); - prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); - last_tsc = now; - arch_spin_unlock(&sync_lock); - - /* - * Be nice every now and then (and also check whether - * measurement is done [we also insert a 10 million - * loops safety exit, so we dont lock up in case the - * TSC readout is totally broken]): - */ - if (unlikely(!(i & 7))) { - if (now > end || i > 10000000) - break; - cpu_relax(); - touch_nmi_watchdog(); - } - /* - * Outside the critical section we can now see whether - * we saw a time-warp of the TSC going backwards: - */ - if (unlikely(prev > now)) { - arch_spin_lock(&sync_lock); - max_warp = max(max_warp, prev - now); - nr_warps++; - arch_spin_unlock(&sync_lock); - } - } - WARN(!(now-start), - "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", - now-start, end-start); -} - -/* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: - */ -void __cpuinit check_tsc_sync_source(int cpu) -{ - int cpus = 2; - - /* - * No need to check if we already know that the TSC is not - * synchronized: - */ - if (unsynchronized_tsc()) - return; - - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - - /* - * Reset it - in case this is a second bootup: - */ - atomic_set(&stop_count, 0); - - /* - * Wait for the target to arrive: - */ - while (atomic_read(&start_count) != cpus-1) - cpu_relax(); - /* - * Trigger the target to continue into the measurement too: - */ - atomic_inc(&start_count); - - check_tsc_warp(); - - while (atomic_read(&stop_count) != cpus-1) - cpu_relax(); - - if (nr_warps) { - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", - smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); - 
mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); - } - - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; - - /* - * Let the target continue with the bootup: - */ - atomic_inc(&stop_count); -} - -/* - * Freshly booted CPUs call into this: - */ -void __cpuinit check_tsc_sync_target(void) -{ - int cpus = 2; - - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) - return; - - /* - * Register this CPU's participation and wait for the - * source CPU to start the measurement: - */ - atomic_inc(&start_count); - while (atomic_read(&start_count) != cpus) - cpu_relax(); - - check_tsc_warp(); - - /* - * Ok, we are done: - */ - atomic_inc(&stop_count); - - /* - * Wait for the source CPU to print stuff: - */ - while (atomic_read(&stop_count) != cpus) - cpu_relax(); -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c4b694..df18f14c473b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,6 +44,8 @@ #include <asm/desc.h> #include <asm/topology.h> #include <asm/vgtod.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #define __vsyscall(nr) \ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, + .trace_clock_is_sync = 1, }; void update_vsyscall_tz(void) @@ -73,6 +76,16 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } +void update_trace_clock_is_sync_vdso(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} 
+EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso); + void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { @@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac4..f894af174b8c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,7 +32,7 @@ #include "irq.h" #include <linux/kvm_host.h> -#include "trace.h" +#include <asm/kvm-trace.h> static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d3653..58bcbce5b02a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,7 +36,7 @@ #include <asm/atomic.h> #include "kvm_cache_regs.h" #include "irq.h" -#include "trace.h" +#include <asm/kvm-trace.h> #include "x86.h" #ifndef CONFIG_X86_64 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d44..3612044ed1f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -163,7 +163,7 @@ module_param(oos_shadow, bool, 0644); #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "mmutrace.h" +#include <asm/kvm-mmutrace.h> #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 63fec1531e89..b14429dda248 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,7 +34,7 @@ #include <asm/kvm_para.h> #include <asm/virtext.h> -#include "trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb82..d12b42e234b9 100644 --- a/arch/x86/kvm/vmx.c +++ 
b/arch/x86/kvm/vmx.c @@ -40,7 +40,7 @@ #include <asm/i387.h> #include <asm/xcr.h> -#include "trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85bf..6a8cb6fe5c12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -47,7 +47,7 @@ #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "trace.h" +#include <asm/kvm-trace.h> #include <asm/debugreg.h> #include <asm/msr.h> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0c..07f7a2722260 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1276,6 +1276,7 @@ __init void lguest_init(void) pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; + pv_cpu_ops.nmi_return = lguest_iret; pv_cpu_ops.load_sp0 = lguest_load_sp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d90ceb882a4..00309849aa16 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <trace/fault.h> #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... 
*/ @@ -35,6 +36,11 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); +DEFINE_TRACE(page_fault_nosem_entry); +DEFINE_TRACE(page_fault_nosem_exit); + /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: @@ -720,6 +726,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; + trace_page_fault_nosem_entry(regs, 14, address); if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -729,6 +736,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + trace_page_fault_nosem_exit(); return; } @@ -1124,7 +1132,9 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ + trace_page_fault_entry(regs, 14, mm, vma, address, write); fault = handle_mm_fault(mm, vma, address, flags); + trace_page_fault_exit(fault); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8f..14b9317eccb8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -141,6 +142,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &flush_state[sender]; + trace_irq_entry(sender, regs, NULL); + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) goto out; /* @@ -167,6 +170,7 @@ out: cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); smp_mb__after_clear_bit(); inc_irq_stat(irq_tlb_count); + trace_irq_exit(IRQ_HANDLED); } static void flush_tlb_others_ipi(const struct cpumask *cpumask, diff --git a/arch/x86/vdso/vclock_gettime.c 
b/arch/x86/vdso/vclock_gettime.c index ee55754cc3c5..7bc481508d00 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,8 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #include "vextern.h" #define gtod vdso_vsyscall_gtod_data @@ -111,6 +113,46 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) return 0; } +/* + * If the TSC is synchronized across all CPUs, read the current TSC + * and export its value in the nsec field of the timespec + */ +notrace static noinline int do_trace_clock(struct timespec *ts) +{ + unsigned long seq; + union lttng_timespec *lts = (union lttng_timespec *) ts; + + do { + seq = read_seqbegin(>od->lock); + if (unlikely(!gtod->trace_clock_is_sync)) + return vdso_fallback_gettime(CLOCK_TRACE, ts); + /* + * We don't protect the rdtsc with the rdtsc_barrier because + * we can't obtain with tracing that level of precision. + * The operation of recording an event is not atomic therefore + * the small chance of imprecision doesn't justify the overhead + * of a barrier. + */ + /* + * TODO: check that vget_cycles(), using paravirt ops, will + * match the TSC read by get_cycles() at the kernel level. 
+ */ + lts->lttng_ts = vget_cycles(); + } while (unlikely(read_seqretry(>od->lock, seq))); + + return 0; +} + +/* + * Returns the cpu_khz, it needs to be a syscall because we can't access + * this value from userspace and it will only be called at the beginning + * of the tracing session + */ +notrace static noinline int do_trace_clock_freq(struct timespec *ts) +{ + return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts); +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled)) @@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) return do_realtime_coarse(ts); case CLOCK_MONOTONIC_COARSE: return do_monotonic_coarse(ts); + case CLOCK_TRACE: + return do_trace_clock(ts); + case CLOCK_TRACE_FREQ: + return do_trace_clock_freq(ts); + default: + return -EINVAL; } return vdso_fallback_gettime(clock, ts); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45fb..e3839c74ec4e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -974,6 +974,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, + .nmi_return = xen_iret, .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 7be8accb0b0c..a380dcf32a51 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_IRET 4 /* return with iret */ #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ +#define TIF_KERNEL_TRACE 7 /* kernel trace active */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_FREEZE 17 /* is freezing for suspend */ @@ -139,11 +140,12 @@ static 
inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) #define _TIF_IRET (1<<TIF_IRET) +#define _TIF_KERNEL_TRACE (1<<TIF_KERNEL_TRACE) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_FREEZE (1<<TIF_FREEZE) -#define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK 0x0000FF7E /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ /* diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index c976285d313e..f7a7d69c3a61 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -27,6 +27,7 @@ #include <linux/debugfs.h> #include <linux/stop_machine.h> #include <linux/i7300_idle.h> +#include <linux/idle.h> #include <asm/idle.h> @@ -584,7 +585,7 @@ static int __init i7300_idle_init(void) } } - idle_notifier_register(&i7300_idle_nb); + register_idle_notifier(&i7300_idle_nb); printk(KERN_INFO "i7300_idle: loaded v%s\n", I7300_IDLE_DRIVER_VERSION); return 0; @@ -592,7 +593,7 @@ static int __init i7300_idle_init(void) static void __exit i7300_idle_exit(void) { - idle_notifier_unregister(&i7300_idle_nb); + unregister_idle_notifier(&i7300_idle_nb); free_cpumask_var(idle_cpumask); if (debugfs_dir) { diff --git a/drivers/input/input.c b/drivers/input/input.c index 11905b6a3023..38930b01cb2e 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -218,6 +218,9 @@ static void input_handle_event(struct input_dev *dev, { int disposition = INPUT_IGNORE_EVENT; + trace_mark(input, input_event, + "type %u code %u value %d", type, code, value); + switch (type) { case EV_SYN: diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index a3ea27ce04f2..33e920aa3a73 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -104,7 +104,7 @@ static inline 
u16 desc_abs_number(port_t *port, u16 desc, int transmit) } -static inline u16 desc_offset(port_t *port, u16 desc, int transmit) +static inline u16 hd_desc_offset(port_t *port, u16 desc, int transmit) { /* Descriptor offset always fits in 16 bits */ return desc_abs_number(port, desc, transmit) * sizeof(pkt_desc); @@ -116,10 +116,10 @@ static inline pkt_desc __iomem *desc_address(port_t *port, u16 desc, { #ifdef PAGE0_ALWAYS_MAPPED return (pkt_desc __iomem *)(win0base(port_to_card(port)) - + desc_offset(port, desc, transmit)); + + hd_desc_offset(port, desc, transmit)); #else return (pkt_desc __iomem *)(winbase(port_to_card(port)) - + desc_offset(port, desc, transmit)); + + hd_desc_offset(port, desc, transmit)); #endif } @@ -169,7 +169,7 @@ static void sca_init_port(port_t *port) for (i = 0; i < buffs; i++) { pkt_desc __iomem *desc = desc_address(port, i, transmit); - u16 chain_off = desc_offset(port, i + 1, transmit); + u16 chain_off = hd_desc_offset(port, i + 1, transmit); u32 buff_off = buffer_offset(port, i, transmit); writew(chain_off, &desc->cp); @@ -187,12 +187,12 @@ static void sca_init_port(port_t *port) /* current desc addr */ sca_out(0, dmac + CPB, card); /* pointer base */ - sca_outw(desc_offset(port, 0, transmit), dmac + CDAL, card); + sca_outw(hd_desc_offset(port, 0, transmit), dmac + CDAL, card); if (!transmit) - sca_outw(desc_offset(port, buffs - 1, transmit), + sca_outw(hd_desc_offset(port, buffs - 1, transmit), dmac + EDAL, card); else - sca_outw(desc_offset(port, 0, transmit), dmac + EDAL, + sca_outw(hd_desc_offset(port, 0, transmit), dmac + EDAL, card); /* clear frame end interrupt counter */ @@ -305,7 +305,7 @@ static inline void sca_rx_intr(port_t *port) dev->stats.rx_over_errors++; while (1) { - u32 desc_off = desc_offset(port, port->rxin, 0); + u32 desc_off = hd_desc_offset(port, port->rxin, 0); pkt_desc __iomem *desc; u32 cda = sca_inw(dmac + CDAL, card); @@ -359,7 +359,7 @@ static inline void sca_tx_intr(port_t *port) while (1) { pkt_desc 
__iomem *desc; - u32 desc_off = desc_offset(port, port->txlast, 1); + u32 desc_off = hd_desc_offset(port, port->txlast, 1); u32 cda = sca_inw(dmac + CDAL, card); if ((cda >= desc_off) && (cda < desc_off + sizeof(pkt_desc))) break; /* Transmitter is/will_be sending this frame */ @@ -660,7 +660,7 @@ static netdev_tx_t sca_xmit(struct sk_buff *skb, struct net_device *dev) writeb(ST_TX_EOM, &desc->stat); port->txin = next_desc(port, port->txin, 1); - sca_outw(desc_offset(port, port->txin, 1), + sca_outw(hd_desc_offset(port, port->txin, 1), get_dmac_tx(port) + EDAL, card); sca_out(DSR_DE, DSR_TX(phy_node(port)), card); /* Enable TX DMA */ diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index e305274f83fb..5c801ea8cc06 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -87,7 +87,7 @@ static inline u16 desc_abs_number(port_t *port, u16 desc, int transmit) } -static inline u16 desc_offset(port_t *port, u16 desc, int transmit) +static inline u16 hd_desc_offset(port_t *port, u16 desc, int transmit) { /* Descriptor offset always fits in 16 bits */ return desc_abs_number(port, desc, transmit) * sizeof(pkt_desc); @@ -98,7 +98,7 @@ static inline pkt_desc __iomem *desc_address(port_t *port, u16 desc, int transmit) { return (pkt_desc __iomem *)(port->card->rambase + - desc_offset(port, desc, transmit)); + hd_desc_offset(port, desc, transmit)); } @@ -143,7 +143,7 @@ static void sca_init_port(port_t *port) for (i = 0; i < buffs; i++) { pkt_desc __iomem *desc = desc_address(port, i, transmit); - u16 chain_off = desc_offset(port, i + 1, transmit); + u16 chain_off = hd_desc_offset(port, i + 1, transmit); u32 buff_off = buffer_offset(port, i, transmit); writel(chain_off, &desc->cp); @@ -162,11 +162,11 @@ static void sca_init_port(port_t *port) sca_out(DCR_ABORT, DCR_TX(port->chan), card); /* current desc addr */ - sca_outl(desc_offset(port, 0, 0), dmac_rx + CDAL, card); - sca_outl(desc_offset(port, card->tx_ring_buffers - 1, 0), + 
sca_outl(hd_desc_offset(port, 0, 0), dmac_rx + CDAL, card); + sca_outl(hd_desc_offset(port, card->tx_ring_buffers - 1, 0), dmac_rx + EDAL, card); - sca_outl(desc_offset(port, 0, 1), dmac_tx + CDAL, card); - sca_outl(desc_offset(port, 0, 1), dmac_tx + EDAL, card); + sca_outl(hd_desc_offset(port, 0, 1), dmac_tx + CDAL, card); + sca_outl(hd_desc_offset(port, 0, 1), dmac_tx + EDAL, card); /* clear frame end interrupt counter */ sca_out(DCR_CLEAR_EOF, DCR_RX(port->chan), card); @@ -249,7 +249,7 @@ static inline int sca_rx_done(port_t *port, int budget) dev->stats.rx_over_errors++; while (received < budget) { - u32 desc_off = desc_offset(port, port->rxin, 0); + u32 desc_off = hd_desc_offset(port, port->rxin, 0); pkt_desc __iomem *desc; u32 cda = sca_inl(dmac + CDAL, card); @@ -590,7 +590,7 @@ static netdev_tx_t sca_xmit(struct sk_buff *skb, struct net_device *dev) writeb(ST_TX_EOM, &desc->stat); port->txin = (port->txin + 1) % card->tx_ring_buffers; - sca_outl(desc_offset(port, port->txin, 1), + sca_outl(hd_desc_offset(port, port->txin, 1), get_dmac_tx(port) + EDAL, card); sca_out(DSR_DE, DSR_TX(port->chan), card); /* Enable TX DMA */ diff --git a/fs/buffer.c b/fs/buffer.c index 2219a76e2caf..5d0c2c6045c8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,11 +41,15 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <trace/fs.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) +DEFINE_TRACE(fs_buffer_wait_start); +DEFINE_TRACE(fs_buffer_wait_end); + inline void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) { @@ -90,7 +94,9 @@ EXPORT_SYMBOL(unlock_buffer); */ void __wait_on_buffer(struct buffer_head * bh) { + trace_fs_buffer_wait_start(bh); wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); + trace_fs_buffer_wait_end(bh); } EXPORT_SYMBOL(__wait_on_buffer); diff --git a/fs/compat.c 
b/fs/compat.c index f6fd0a00e6cc..7b0389465cd2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -50,6 +50,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <trace/fs.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1529,6 +1530,7 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; + trace_fs_exec(filename); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exec.c b/fs/exec.c index 52a447d9b6ab..9a92bbe142d3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include <linux/fs_struct.h> #include <linux/pipe_fs_i.h> #include <linux/oom.h> +#include <trace/fs.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -77,6 +78,11 @@ static atomic_t call_count = ATOMIC_INIT(1); static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); +/* + * Also used in compat.c. + */ +DEFINE_TRACE(fs_exec); + int __register_binfmt(struct linux_binfmt * fmt, int insert) { if (!fmt) @@ -1447,6 +1453,7 @@ int do_execve(const char * filename, if (retval < 0) goto out; + trace_fs_exec(filename); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/ioctl.c b/fs/ioctl.c index 1eebeb72b202..a1fecf33b110 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,9 +15,12 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> +#include <trace/fs.h> #include <asm/ioctls.h> +DEFINE_TRACE(fs_ioctl); + /* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) @@ -616,6 +619,8 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) if (!filp) goto out; + trace_fs_ioctl(fd, cmd, arg); + error = security_file_ioctl(filp, cmd, arg); if (error) goto out_fput; diff --git a/fs/open.c b/fs/open.c index 5a2c6ebc22b5..d29d5eb3c432 100644 --- a/fs/open.c +++ b/fs/open.c @@ -30,9 +30,13 @@ #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> +#include <trace/fs.h> #include "internal.h" +DEFINE_TRACE(fs_open); +DEFINE_TRACE(fs_close); + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { @@ -898,6 +902,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) fsnotify_open(f); fd_install(fd, f); } + trace_fs_open(fd, tmp); } putname(tmp); } @@ -987,6 +992,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) filp = fdt->fd[fd]; if (!filp) goto out_unlock; + trace_fs_close(fd); rcu_assign_pointer(fdt->fd[fd], NULL); FD_CLR(fd, fdt->close_on_exec); __put_unused_fd(files, fd); diff --git a/fs/read_write.c b/fs/read_write.c index 5520f8ad5504..6a3f7f9c9db6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> +#include <trace/fs.h> #include "read_write.h" #include <asm/uaccess.h> @@ -30,6 +31,15 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); +DEFINE_TRACE(fs_lseek); +DEFINE_TRACE(fs_llseek); +DEFINE_TRACE(fs_read); +DEFINE_TRACE(fs_write); +DEFINE_TRACE(fs_pread64); +DEFINE_TRACE(fs_pwrite64); +DEFINE_TRACE(fs_readv); +DEFINE_TRACE(fs_writev); + static inline int unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; @@ -187,6 +197,9 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 
bit platforms */ } + + trace_fs_lseek(fd, offset, origin); + fput_light(file, fput_needed); bad: return retval; @@ -214,6 +227,8 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, origin); + trace_fs_llseek(fd, offset, origin); + retval = (int)offset; if (offset >= 0) { retval = -EFAULT; @@ -409,6 +424,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) if (file) { loff_t pos = file_pos_read(file); ret = vfs_read(file, buf, count, &pos); + trace_fs_read(fd, buf, count, ret); file_pos_write(file, pos); fput_light(file, fput_needed); } @@ -427,6 +443,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, if (file) { loff_t pos = file_pos_read(file); ret = vfs_write(file, buf, count, &pos); + trace_fs_write(fd, buf, count, ret); file_pos_write(file, pos); fput_light(file, fput_needed); } @@ -447,8 +464,11 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, file = fget_light(fd, &fput_needed); if (file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PREAD) + if (file->f_mode & FMODE_PREAD) { ret = vfs_read(file, buf, count, &pos); + trace_fs_pread64(fd, buf, count, pos, ret); + } + fput_light(file, fput_needed); } @@ -476,8 +496,10 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, file = fget_light(fd, &fput_needed); if (file) { ret = -ESPIPE; - if (file->f_mode & FMODE_PWRITE) + if (file->f_mode & FMODE_PWRITE) { ret = vfs_write(file, buf, count, &pos); + trace_fs_pwrite64(fd, buf, count, pos, ret); + } fput_light(file, fput_needed); } @@ -736,6 +758,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, if (file) { loff_t pos = file_pos_read(file); ret = vfs_readv(file, vec, vlen, &pos); + trace_fs_readv(fd, vec, vlen, ret); file_pos_write(file, pos); fput_light(file, fput_needed); } @@ -757,6 +780,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, if 
(file) { loff_t pos = file_pos_read(file); ret = vfs_writev(file, vec, vlen, &pos); + trace_fs_writev(fd, vec, vlen, ret); file_pos_write(file, pos); fput_light(file, fput_needed); } diff --git a/fs/select.c b/fs/select.c index e56560d2b08a..64c2404f2cc0 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> +#include <trace/fs.h> #include <asm/uaccess.h> @@ -98,6 +99,9 @@ struct poll_table_page { #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) +DEFINE_TRACE(fs_select); +DEFINE_TRACE(fs_poll); + /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to @@ -112,6 +116,9 @@ struct poll_table_page { */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); +static void __pollwait_exclusive(struct file *filp, + wait_queue_head_t *wait_address, + poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { @@ -152,6 +159,20 @@ void poll_freewait(struct poll_wqueues *pwq) } EXPORT_SYMBOL(poll_freewait); +/** + * poll_wait_set_exclusive - set poll wait queue to exclusive + * + * Sets up a poll wait queue to use exclusive wakeups. This is useful to + * wake up only one waiter at each wakeup. Used to work-around "thundering herd" + * problem. 
+ */ +void poll_wait_set_exclusive(poll_table *p) +{ + if (p) + init_poll_funcptr(p, __pollwait_exclusive); +} +EXPORT_SYMBOL(poll_wait_set_exclusive); + static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; @@ -213,8 +234,10 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) } /* Add a new entry */ -static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, - poll_table *p) +static void __pollwait_common(struct file *filp, + wait_queue_head_t *wait_address, + poll_table *p, + int exclusive) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); @@ -226,7 +249,23 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; - add_wait_queue(wait_address, &entry->wait); + if (!exclusive) + add_wait_queue(wait_address, &entry->wait); + else + add_wait_queue_exclusive(wait_address, &entry->wait); +} + +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p) +{ + __pollwait_common(filp, wait_address, p, 0); +} + +static void __pollwait_exclusive(struct file *filp, + wait_queue_head_t *wait_address, + poll_table *p) +{ + __pollwait_common(filp, wait_address, p, 1); } int poll_schedule_timeout(struct poll_wqueues *pwq, int state, @@ -450,6 +489,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) file = fget_light(i, &fput_needed); if (file) { f_op = file->f_op; + trace_fs_select(i, end_time); mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { wait_key_set(wait, in, out, bit); @@ -739,6 +779,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) file = fget_light(fd, &fput_needed); mask = POLLNVAL; if (file != NULL) { + trace_fs_poll(fd); mask = DEFAULT_POLLMASK; if (file->f_op && file->f_op->poll) { if 
(pwait) diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..691c84baf4f9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -819,3 +819,47 @@ struct hlist_node *seq_hlist_next_rcu(void *v, return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); + +struct list_head *seq_sorted_list_start(struct list_head *head, loff_t *ppos) +{ + struct list_head *lh; + + list_for_each(lh, head) + if ((unsigned long)lh >= *ppos) { + *ppos = (unsigned long)lh; + return lh; + } + return NULL; +} +EXPORT_SYMBOL(seq_sorted_list_start); + +struct list_head *seq_sorted_list_start_head(struct list_head *head, + loff_t *ppos) +{ + struct list_head *lh; + + if (!*ppos) { + *ppos = (unsigned long)head; + return head; + } + list_for_each(lh, head) + if ((unsigned long)lh >= *ppos) { + *ppos = (long)lh->prev; + return lh->prev; + } + return NULL; +} +EXPORT_SYMBOL(seq_sorted_list_start_head); + +struct list_head *seq_sorted_list_next(void *p, struct list_head *head, + loff_t *ppos) +{ + struct list_head *lh; + void *next; + + lh = ((struct list_head *)p)->next; + next = (lh == head) ? NULL : lh; + *ppos = next ? ((unsigned long)next) : (-1UL); + return next; +} +EXPORT_SYMBOL(seq_sorted_list_next); diff --git a/fs/splice.c b/fs/splice.c index 50a5d978da16..e76aac5c9931 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -259,6 +259,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL_GPL(splice_to_pipe); static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { diff --git a/include/asm-generic/trace-clock.h b/include/asm-generic/trace-clock.h new file mode 100644 index 000000000000..138ac9b7c546 --- /dev/null +++ b/include/asm-generic/trace-clock.h @@ -0,0 +1,76 @@ +#ifndef _ASM_GENERIC_TRACE_CLOCK_H +#define _ASM_GENERIC_TRACE_CLOCK_H + +/* + * include/asm-generic/trace-clock.h + * + * Copyright (C) 2007 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Generic tracing clock for architectures without TSC. 
+ */ + +#include <linux/param.h> /* For HZ */ +#include <asm/atomic.h> + +#define TRACE_CLOCK_SHIFT 13 + +/* + * Number of hardware clock bits. The higher order bits are expected to be 0. + * If the hardware clock source has more than 32 bits, the bits higher than the + * 32nd will be truncated by a cast to a 32 bits unsigned. Range : 1 - 32. + * (too few bits would be unrealistic though, since we depend on the timer to + * detect the overflows). + */ +#define TC_HW_BITS 32 + +/* Expected maximum interrupt latency in ms : 15ms, *2 for security */ +#define TC_EXPECTED_INTERRUPT_LATENCY 30 + +extern atomic_long_t trace_clock_var; + +static inline u32 trace_clock_read32(void) +{ + return (u32)atomic_long_add_return(1, &trace_clock_var); +} + +#ifdef CONFIG_HAVE_TRACE_CLOCK_32_TO_64 +extern u64 trace_clock_read_synthetic_tsc(void); +extern void get_synthetic_tsc(void); +extern void put_synthetic_tsc(void); + +static inline u64 trace_clock_read64(void) +{ + return trace_clock_read_synthetic_tsc(); +} +#else +static inline void get_synthetic_tsc(void) +{ +} + +static inline void put_synthetic_tsc(void) +{ +} + +static inline u64 trace_clock_read64(void) +{ + return atomic_long_add_return(1, &trace_clock_var); +} +#endif + +static inline unsigned int trace_clock_frequency(void) +{ + return HZ << TRACE_CLOCK_SHIFT; +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern void get_trace_clock(void); +extern void put_trace_clock(void); + +static inline void set_trace_clock_is_sync(int state) +{ +} +#endif /* _ASM_GENERIC_TRACE_CLOCK_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index fe77e3395b40..9d4b63d7ab0e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -167,6 +167,12 @@ CPU_KEEP(exit.data) \ MEM_KEEP(init.data) \ MEM_KEEP(exit.data) \ + . = ALIGN(128); \ + VMLINUX_SYMBOL(__start___markers) = .; \ + *(__markers) \ + VMLINUX_SYMBOL(__stop___markers) = .; \ + . 
= ALIGN(32); \ + VMLINUX_SYMBOL(__start___tracepoints) = .; \ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ diff --git a/include/linux/Kbuild b/include/linux/Kbuild index b0ada6f37dd6..a63b8001b7e2 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -42,6 +42,7 @@ header-y += adfs_fs.h header-y += affs_hardblocks.h header-y += agpgart.h header-y += aio_abi.h +header-y += align.h header-y += apm_bios.h header-y += arcfb.h header-y += atalk.h diff --git a/include/linux/align.h b/include/linux/align.h new file mode 100644 index 000000000000..68bad1ac089f --- /dev/null +++ b/include/linux/align.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_ALIGN_H +#define _LINUX_ALIGN_H + +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) \ + (((x) + (mask)) & ~(mask)) + +#ifdef __KERNEL__ + +#include <linux/types.h> + +#define ALIGN(x, a) __ALIGN_KERNEL(x, a) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK(x, mask) +#define PTR_ALIGN(p, a) ((typeof(p)) ALIGN((unsigned long) (p), a)) +#define ALIGN_FLOOR(x, a) __ALIGN_FLOOR_MASK(x, (typeof(x)) (a) - 1) +#define __ALIGN_FLOOR_MASK(x, mask) ((x) & ~(mask)) +#define PTR_ALIGN_FLOOR(p, a) \ + ((typeof(p)) ALIGN_FLOOR((unsigned long) (p), a)) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x)) (a) - 1)) == 0) + +/* + * Align pointer on natural object alignment. + */ +#define object_align(obj) PTR_ALIGN(obj, __alignof__(*(obj))) +#define object_align_floor(obj) PTR_ALIGN_FLOOR(obj, __alignof__(*(obj))) + +#define MAYBE_BUILD_BUG_ON(condition) \ +do { \ + if (__builtin_constant_p(condition)) \ + BUILD_BUG_ON(condition); \ +} while (0) + +/** + * offset_align - Calculate the offset needed to align an object on its natural + * alignment towards higher addresses. + * @align_drift: object offset from an "alignment"-aligned address. + * @alignment: natural object alignment. Must be non-zero, power of 2. 
+ * + * Returns the offset that must be added to align towards higher + * addresses. + */ +#define offset_align(align_drift, alignment) \ + ({ \ + MAYBE_BUILD_BUG_ON((alignment) == 0 \ + || ((alignment) & ((alignment) - 1))); \ + (((alignment) - (align_drift)) & ((alignment) - 1)); \ + }) + +/** + * offset_align_floor - Calculate the offset needed to align an object + * on its natural alignment towards lower addresses. + * @align_drift: object offset from an "alignment"-aligned address. + * @alignment: natural object alignment. Must be non-zero, power of 2. + * + * Returns the offset that must be substracted to align towards lower addresses. + */ +#define offset_align_floor(align_drift, alignment) \ + ({ \ + MAYBE_BUILD_BUG_ON((alignment) == 0 \ + || ((alignment) & ((alignment) - 1))); \ + (((align_drift) - (alignment)) & ((alignment) - 1); \ + }) + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/linux/idle.h b/include/linux/idle.h new file mode 100644 index 000000000000..75bd2a422c84 --- /dev/null +++ b/include/linux/idle.h @@ -0,0 +1,19 @@ +/* + * include/linux/idle.h - generic idle definition + * + */ +#ifndef _LINUX_IDLE_H_ +#define _LINUX_IDLE_H_ + +#include <linux/notifier.h> + +enum idle_val { + IDLE_START = 1, + IDLE_END = 2, +}; + +int notify_idle(enum idle_val val); +void register_idle_notifier(struct notifier_block *n); +void unregister_idle_notifier(struct notifier_block *n); + +#endif /* _LINUX_IDLE_H_ */ diff --git a/include/linux/immediate.h b/include/linux/immediate.h new file mode 100644 index 000000000000..0d62cab5c2e8 --- /dev/null +++ b/include/linux/immediate.h @@ -0,0 +1,93 @@ +#ifndef _LINUX_IMMEDIATE_H +#define _LINUX_IMMEDIATE_H + +/* + * Immediate values, can be updated at runtime and save cache lines. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * Dual BSD/GPL v2 license. 
+ */ + +#ifdef CONFIG_IMMEDIATE + +struct __imv { + unsigned long var; /* Pointer to the identifier variable of the + * immediate value + */ + unsigned long imv; /* + * Pointer to the memory location of the + * immediate value within the instruction. + */ + unsigned char size; /* Type size. */ +} __attribute__ ((packed)); + +#include <asm/immediate.h> + +/** + * imv_set - set immediate variable (with locking) + * @name: immediate value name + * @i: required value + * + * Sets the value of @name, taking the module_mutex if required by + * the architecture. + */ +#define imv_set(name, i) \ + do { \ + name##__imv = (i); \ + core_imv_update(); \ + module_imv_update(); \ + } while (0) + +/* + * Internal update functions. + */ +extern void core_imv_update(void); +extern void imv_update_range(const struct __imv *begin, + const struct __imv *end); + +#else + +/* + * Generic immediate values: a simple, standard, memory load. + */ + +/** + * imv_read - read immediate variable + * @name: immediate value name + * + * Reads the value of @name. + */ +#define imv_read(name) _imv_read(name) + +/** + * imv_set - set immediate variable (with locking) + * @name: immediate value name + * @i: required value + * + * Sets the value of @name, taking the module_mutex if required by + * the architecture. + */ +#define imv_set(name, i) (name##__imv = (i)) + +static inline void core_imv_update(void) { } +static inline void module_imv_update(void) { } + +#endif + +#define DECLARE_IMV(type, name) extern __typeof__(type) name##__imv +#define DEFINE_IMV(type, name) __typeof__(type) name##__imv + +#define EXPORT_IMV_SYMBOL(name) EXPORT_SYMBOL(name##__imv) +#define EXPORT_IMV_SYMBOL_GPL(name) EXPORT_SYMBOL_GPL(name##__imv) + +/** + * _imv_read - Read immediate value with standard memory load. + * @name: immediate value name + * + * Force a data read of the immediate value instead of the immediate value + * based mechanism. Useful for __init and __exit section data read. 
+ */ +#define _imv_read(name) (name##__imv) + +#endif diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 3bc4dcab6e82..3f8c992934ab 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -24,6 +24,7 @@ #else /* CONFIG_GENERIC_HARDIRQS */ extern int nr_irqs; +struct irq_desc; extern struct irq_desc *irq_to_desc(unsigned int irq); unsigned int irq_get_next_irq(unsigned int offset); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2fe6e84894a4..eff75504cf17 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -4,8 +4,8 @@ /* * 'kernel.h' contains some often-used function prototypes etc */ -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#include <linux/align.h> #ifdef __KERNEL__ @@ -37,11 +37,6 @@ #define STACK_MAGIC 0xdeadbeef -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) - #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b5021db21858..914f0196130e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/preempt.h> +#include <linux/marker.h> #include <linux/msi.h> #include <linux/slab.h> #include <linux/rcupdate.h> diff --git a/include/linux/ltt-channels.h b/include/linux/ltt-channels.h new file mode 100644 index 000000000000..d8d368d5440b --- /dev/null +++ b/include/linux/ltt-channels.h @@ -0,0 +1,108 @@ +#ifndef _LTT_CHANNELS_H +#define _LTT_CHANNELS_H + +/* + * Copyright (C) 2008 Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Dynamic tracer channel allocation. + + * Dual LGPL v2.1/GPL v2 license. 
+ */ + +#include <linux/limits.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/timer.h> +#include <linux/ltt-core.h> + +#define EVENTS_PER_CHANNEL 65536 + +/* + * Forward declaration of locking-specific per-cpu buffer structure. + */ +struct ltt_chanbuf; +struct ltt_trace; +struct ltt_serialize_closure; +struct ltt_probe_private_data; + +/* Serialization callback '%k' */ +typedef size_t (*ltt_serialize_cb)(struct ltt_chanbuf *buf, size_t buf_offset, + struct ltt_serialize_closure *closure, + void *serialize_private, + unsigned int stack_pos_ctx, + int *largest_align, + const char *fmt, va_list *args); + +struct ltt_probe_private_data { + struct ltt_trace *trace; /* + * Target trace, for metadata + * or statedump. + */ + ltt_serialize_cb serializer; /* + * Serialization function override. + */ + void *serialize_private; /* + * Private data for serialization + * functions. + */ +}; + +struct ltt_chan_alloc { + unsigned long buf_size; /* Size of the buffer */ + unsigned long sb_size; /* Sub-buffer size */ + unsigned int sb_size_order; /* Order of sub-buffer size */ + unsigned int n_sb_order; /* Number of sub-buffers per buffer */ + int extra_reader_sb:1; /* Bool: has extra reader subbuffer */ + struct ltt_chanbuf *buf; /* Channel per-cpu buffers */ + + struct kref kref; /* Reference count */ + unsigned long n_sb; /* Number of sub-buffers */ + struct dentry *parent; /* Associated parent dentry */ + struct dentry *ascii_dentry; /* Text output dentry */ + struct ltt_trace *trace; /* Associated trace */ + char filename[NAME_MAX]; /* Filename for channel files */ +}; + +struct ltt_chan { + struct ltt_chan_alloc a; /* Parent. First field. */ + int overwrite:1; + int active:1; + unsigned long commit_count_mask; /* + * Commit count mask, removing + * the MSBs corresponding to + * bits used to represent the + * subbuffer index. 
+ */ + unsigned long switch_timer_interval; +}; + +struct ltt_channel_setting { + unsigned int sb_size; + unsigned int n_sb; + struct kref kref; /* Number of references to structure content */ + struct list_head list; + unsigned int index; /* index of channel in trace channel array */ + u16 free_event_id; /* Next event ID to allocate */ + char name[PATH_MAX]; +}; + +int ltt_channels_register(const char *name); +int ltt_channels_unregister(const char *name, int compacting); +int ltt_channels_set_default(const char *name, + unsigned int subbuf_size, + unsigned int subbuf_cnt); +const char *ltt_channels_get_name_from_index(unsigned int index); +int ltt_channels_get_index_from_name(const char *name); +int ltt_channels_trace_ref(void); +struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels, + int overwrite, int active); +void ltt_channels_trace_free(struct ltt_chan *channels, + unsigned int nr_channels); +void ltt_channels_trace_set_timer(struct ltt_chan *chan, + unsigned long interval); + +int _ltt_channels_get_event_id(const char *channel, const char *name); +int ltt_channels_get_event_id(const char *channel, const char *name); +void _ltt_channels_reset_event_ids(void); + +#endif /* _LTT_CHANNELS_H */ diff --git a/include/linux/ltt-core.h b/include/linux/ltt-core.h new file mode 100644 index 000000000000..acb696ed106e --- /dev/null +++ b/include/linux/ltt-core.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2005-2010 Mathieu Desnoyers (mathieu.desnoyers@efficios.com) + * + * This contains the core definitions for the Linux Trace Toolkit. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#ifndef LTT_CORE_H +#define LTT_CORE_H + +/* Keep track of trap nesting inside LTT */ +DECLARE_PER_CPU(unsigned int, ltt_nesting); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + +/* + * Calculate the offset needed to align the type. + * size_of_type must be non-zero. 
+ */ +static inline unsigned int ltt_align(size_t align_drift, size_t size_of_type) +{ + return offset_align(align_drift, min(sizeof(void *), size_of_type)); +} +/* Default arch alignment */ +#define LTT_ALIGN + +static inline int ltt_get_alignment(void) +{ + return sizeof(void *); +} + +extern unsigned int ltt_fmt_largest_align(size_t align_drift, const char *fmt); + +#else + +static inline unsigned int ltt_align(size_t align_drift, + size_t size_of_type) +{ + return 0; +} + +#define LTT_ALIGN __attribute__((packed)) + +static inline int ltt_get_alignment(void) +{ + return 0; +} + +static inline unsigned int ltt_fmt_largest_align(size_t align_drift, + const char *fmt) +{ + return 0; +} + +#endif /* HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#endif /* LTT_CORE_H */ diff --git a/include/linux/marker.h b/include/linux/marker.h new file mode 100644 index 000000000000..c50c66d09f04 --- /dev/null +++ b/include/linux/marker.h @@ -0,0 +1,273 @@ +#ifndef _LINUX_MARKER_H +#define _LINUX_MARKER_H + +/* + * Code markup for dynamic and static tracing. + * + * See Documentation/marker.txt. + * + * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <stdarg.h> +#include <linux/types.h> +#include <linux/immediate.h> + +struct module; +struct marker; +struct marker_probe_array; + +/** + * marker_probe_func - Type of a marker probe function + * @mdata: marker data + * @probe_private: probe private data + * @call_private: call site private data + * @fmt: format string + * @args: variable argument list pointer. Use a pointer to overcome C's + * inability to pass this around as a pointer in a portable manner in + * the callee otherwise. + * + * Type of marker probe functions. They receive the mdata and need to parse the + * format string to recover the variable argument list. 
+ */ +typedef void marker_probe_func(const struct marker *mdata, + void *probe_private, void *call_private, + const char *fmt, va_list *args); + +struct marker_probe_closure { + marker_probe_func *func; /* Callback */ + void *probe_private; /* Private probe data */ +}; + +struct marker { + const char *channel; /* Name of channel where to send data */ + const char *name; /* Marker name */ + const char *format; /* Marker format string, describing the + * variable argument list. + */ + DEFINE_IMV(char, state);/* Immediate value state. */ + char ptype; /* probe type : 0 : single, 1 : multi */ + /* Probe wrapper */ + u16 channel_id; /* Numeric channel identifier, dynamic */ + u16 event_id; /* Numeric event identifier, dynamic */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_array *multi; + const char *tp_name; /* Optional tracepoint name */ + void *tp_cb; /* Optional tracepoint callback */ +} __attribute__((aligned(128))); /* + * Aligned on 128 bytes because it is + * globally visible and gcc happily + * align these on the structure size. + * Keep in sync with vmlinux.lds.h. 
+ */ + +#ifdef CONFIG_MARKERS + +#define _DEFINE_MARKER(channel, name, tp_name_str, tp_cb, format) \ + static const char __mstrtab_##channel##_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #channel "\0" #name "\0" format; \ + static struct marker __mark_##channel##_##name \ + __attribute__((section("__markers"), aligned(128))) = \ + { __mstrtab_##channel##_##name, \ + &__mstrtab_##channel##_##name[sizeof(#channel)], \ + &__mstrtab_##channel##_##name[sizeof(#channel) + \ + sizeof(#name)], \ + 0, 0, 0, 0, marker_probe_cb, \ + { __mark_empty_function, NULL}, \ + NULL, tp_name_str, tp_cb } + +#define DEFINE_MARKER(channel, name, format) \ + _DEFINE_MARKER(channel, name, NULL, NULL, format) + +#define DEFINE_MARKER_TP(channel, name, tp_name, tp_cb, format) \ + _DEFINE_MARKER(channel, name, #tp_name, tp_cb, format) + +/* + * Make sure the alignment of the structure in the __markers section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + * + * The "generic" argument controls which marker enabling mechanism must be used. + * If generic is true, a variable read is used. + * If generic is false, immediate values are used. + */ +#define __trace_mark(generic, channel, name, call_private, format, args...) \ + do { \ + DEFINE_MARKER(channel, name, format); \ + __mark_check_format(format, ## args); \ + if (!generic) { \ + if (unlikely(imv_read( \ + __mark_##channel##_##name.state))) \ + (*__mark_##channel##_##name.call) \ + (&__mark_##channel##_##name, \ + call_private, ## args); \ + } else { \ + if (unlikely(_imv_read( \ + __mark_##channel##_##name.state))) \ + (*__mark_##channel##_##name.call) \ + (&__mark_##channel##_##name, \ + call_private, ## args); \ + } \ + } while (0) + +#define __trace_mark_tp(channel, name, call_private, tp_name, tp_cb, \ + format, args...) 
\ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb, NULL); \ + } \ + DEFINE_MARKER_TP(channel, name, tp_name, tp_cb, format);\ + __mark_check_format(format, ## args); \ + (*__mark_##channel##_##name.call)(&__mark_##channel##_##name, \ + call_private, ## args); \ + } while (0) + +extern void marker_update_probe_range(struct marker *begin, + struct marker *end); + +#define GET_MARKER(channel, name) (__mark_##channel##_##name) + +#else /* !CONFIG_MARKERS */ +#define DEFINE_MARKER(channel, name, tp_name, tp_cb, format) +#define __trace_mark(generic, channel, name, call_private, format, args...) \ + __mark_check_format(format, ## args) +#define __trace_mark_tp(channel, name, call_private, tp_name, tp_cb, \ + format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb, NULL); \ + } \ + __mark_check_format(format, ## args); \ + } while (0) +static inline void marker_update_probe_range(struct marker *begin, + struct marker *end) +{ } +#define GET_MARKER(channel, name) +#endif /* CONFIG_MARKERS */ + +/** + * trace_mark - Marker using code patching + * @channel: marker channel (where to send the data), not quoted. + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using optimized code patching technique (imv_read()) + * to be enabled when immediate values are present. + */ +#define trace_mark(channel, name, format, args...) \ + __trace_mark(0, channel, name, NULL, format, ## args) + +/** + * _trace_mark - Marker using variable read + * @channel: marker channel (where to send the data), not quoted. + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using a standard memory read (_imv_read()) to be + * enabled. Should be used for markers in code paths where instruction + * modification based enabling is not welcome. 
(__init and __exit functions, + * lockdep, some traps, printk). + */ +#define _trace_mark(channel, name, format, args...) \ + __trace_mark(1, channel, name, NULL, format, ## args) + +/** + * trace_mark_tp - Marker in a tracepoint callback + * @channel: marker channel (where to send the data), not quoted. + * @name: marker name, not quoted. + * @tp_name: tracepoint name, not quoted. + * @tp_cb: tracepoint callback. Should have an associated global symbol so it + * is not optimized away by the compiler (should not be static). + * @format: format string + * @args...: variable argument list + * + * Places a marker in a tracepoint callback. + */ +#define trace_mark_tp(channel, name, tp_name, tp_cb, format, args...) \ + __trace_mark_tp(channel, name, NULL, tp_name, tp_cb, format, ## args) + +/** + * MARK_NOARGS - Format string for a marker with no argument. + */ +#define MARK_NOARGS " " + +extern void lock_markers(void); +extern void unlock_markers(void); + +extern void markers_compact_event_ids(void); + +/* To be used for string format validity checking with gcc */ +static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...) +{ +} + +#define __mark_check_format(format, args...) \ + do { \ + if (0) \ + ___mark_check_format(format, ## args); \ + } while (0) + +extern marker_probe_func __mark_empty_function; + +extern void marker_probe_cb(const struct marker *mdata, + void *call_private, ...); + +/* + * Connect a probe to a marker. + * private data pointer must be a valid allocated memory address, or NULL. + */ +extern int marker_probe_register(const char *channel, const char *name, + const char *format, marker_probe_func *probe, void *probe_private); + +/* + * Returns the private data given to marker_probe_register. + */ +extern int marker_probe_unregister(const char *channel, const char *name, + marker_probe_func *probe, void *probe_private); +/* + * Unregister a marker by providing the registered private data. 
+ */ +extern int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private); + +extern void *marker_get_private_data(const char *channel, const char *name, + marker_probe_func *probe, int num); + +const char *marker_get_name_from_id(u16 channel_id, u16 event_id); +const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id); + +/* + * marker_synchronize_unregister must be called between the last marker probe + * unregistration and the first one of + * - the end of module exit function + * - the free of any resource used by the probes + * to ensure the code and data are valid for any possibly running probes. + */ +#define marker_synchronize_unregister() synchronize_sched() + +struct marker_iter { + struct module *module; + struct marker *marker; +}; + +extern void marker_iter_start(struct marker_iter *iter); +extern void marker_iter_next(struct marker_iter *iter); +extern void marker_iter_stop(struct marker_iter *iter); +extern void marker_iter_reset(struct marker_iter *iter); +extern int marker_get_iter_range(struct marker **marker, struct marker *begin, + struct marker *end); +extern int _is_marker_enabled(const char *channel, const char *name); +extern int is_marker_enabled(const char *channel, const char *name); +extern int is_marker_present(const char *channel, const char *name); +extern void marker_update_probes(void); + +#endif diff --git a/include/linux/module.h b/include/linux/module.h index 5de42043dff0..35c96eed6dce 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -15,6 +15,7 @@ #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> +#include <linux/marker.h> #include <linux/tracepoint.h> #include <linux/percpu.h> @@ -376,6 +377,10 @@ struct module /* The command line arguments (may be mangled). 
People like keeping pointers to this stuff */ char *args; +#ifdef CONFIG_MARKERS + struct marker *markers; + unsigned int num_markers; +#endif #ifdef CONFIG_TRACEPOINTS struct tracepoint * const *tracepoints_ptrs; unsigned int num_tracepoints; @@ -574,6 +579,10 @@ int register_module_notifier(struct notifier_block * nb); int unregister_module_notifier(struct notifier_block * nb); extern void print_modules(void); +extern void list_modules(void *call_data); + +extern void module_update_markers(void); +extern int module_get_iter_markers(struct marker_iter *iter); extern void module_update_tracepoints(void); extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); @@ -694,6 +703,14 @@ static inline void print_modules(void) { } +static inline void list_modules(void *call_data) +{ +} + +static inline void module_update_markers(void) +{ +} + static inline void module_update_tracepoints(void) { } @@ -702,6 +719,12 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) { return 0; } + +static inline int module_get_iter_markers(struct marker_iter *iter) +{ + return 0; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b0340..f97738da2040 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -44,6 +44,7 @@ #include <linux/rculist.h> #include <linux/dmaengine.h> #include <linux/workqueue.h> +#include <trace/net.h> #include <linux/ethtool.h> #include <net/net_namespace.h> diff --git a/include/linux/poll.h b/include/linux/poll.h index 1a2ccd6f3823..09e1375d7688 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -81,6 +81,8 @@ static inline int poll_schedule(struct poll_wqueues *pwq, int state) return poll_schedule_timeout(pwq, state, NULL, 0); } +extern void poll_wait_set_exclusive(poll_table *p); + /* * Scaleable version of the fd_set. 
*/ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2dea94fc4402..ad33cb475d06 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -241,6 +241,16 @@ static inline void list_splice_init_rcu(struct list_head *list, #define list_first_entry_rcu(ptr, type, member) \ list_entry_rcu((ptr)->next, type, member) +#define __list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference_raw(list_next_rcu(head)); \ + pos != (head); \ + pos = rcu_dereference_raw(list_next_rcu((pos))) + +#define __list_for_each_entry_rcu(pos, head, member) \ + for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) + /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. @@ -288,6 +298,23 @@ static inline void list_splice_init_rcu(struct list_head *list, pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** + * list_for_each_entry_continue_rcu - continue iteration over typed rcu list + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + * It continues an iteration initiated by list_for_each_entry_rcu(). + */ +#define list_for_each_entry_continue_rcu(pos, head, member) \ + for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) + + +/** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. 
* diff --git a/include/linux/sched.h b/include/linux/sched.h index 777d8a5ed06b..d6cde3a56cc8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2620,6 +2620,9 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +extern void clear_kernel_trace_flag_all_tasks(void); +extern void set_kernel_trace_flag_all_tasks(void); + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 03c0232b4169..34f8680a7b91 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -153,4 +153,25 @@ extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos); + +/* + * Helpers for iteration over a list sorted by ascending head pointer address. + * To be used in contexts where preemption cannot be disabled to insure to + * continue iteration on a modified list starting at the same location where it + * stopped, or at a following location. It insures that the lost information + * will only be in elements added/removed from the list between iterations. + * void *pos is only used to get the next list element and may not be a valid + * list_head anymore when given to seq_sorted_list_start() or + * seq_sorted_list_start_head(). 
+ */ +extern struct list_head *seq_sorted_list_start(struct list_head *head, + loff_t *ppos); +extern struct list_head *seq_sorted_list_start_head(struct list_head *head, + loff_t *ppos); +/* + * next must be called with an existing p node + */ +extern struct list_head *seq_sorted_list_next(void *p, struct list_head *head, + loff_t *ppos); + #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 4d559325d919..3d2cd9b993c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -341,6 +341,7 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); +extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -384,6 +385,8 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) } #endif +extern void ltt_dump_swap_files(void *call_data); + #else /* CONFIG_SWAP */ #define nr_swap_pages 0L @@ -508,6 +511,10 @@ mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) } #endif +static inline void ltt_dump_swap_files(void *call_data) +{ +} + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cd42e30b7c6e..436a327d8037 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -76,6 +76,14 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) return __swp_entry_to_pte(arch_entry); } +static inline swp_entry_t page_swp_entry(struct page *page) +{ + swp_entry_t entry; + VM_BUG_ON(!PageSwapCache(page)); + entry.val = page_private(page); + return entry; +} + #ifdef CONFIG_MIGRATION static inline swp_entry_t make_migration_entry(struct page *page, int write) { diff --git a/include/linux/time.h b/include/linux/time.h index 
1e6d3b59238d..8ae676f1e7c7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -292,6 +292,8 @@ struct itimerval { #define CLOCK_MONOTONIC_RAW 4 #define CLOCK_REALTIME_COARSE 5 #define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_TRACE_FREQ 14 +#define CLOCK_TRACE 15 /* * The IDs of various hardware clocks: diff --git a/include/linux/trace-clock.h b/include/linux/trace-clock.h new file mode 100644 index 000000000000..273991a96388 --- /dev/null +++ b/include/linux/trace-clock.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_TRACE_CLOCK_H +#define _LINUX_TRACE_CLOCK_H + +/* + * Trace clock + * + * Chooses between an architecture specific clock or an atomic logical clock. + * + * Copyright (C) 2007,2008 Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + */ + +#ifdef CONFIG_HAVE_TRACE_CLOCK +#include <asm/trace-clock.h> +#else +#include <asm-generic/trace-clock.h> +#endif /* CONFIG_HAVE_TRACE_CLOCK */ +#endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 425bcfe56c62..f3b763efe119 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -1,8 +1,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer -#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_TIMER_H +#if !defined(_TRACE_EVENTS_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENTS_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> @@ -323,7 +323,7 @@ TRACE_EVENT(itimer_expire, (int) __entry->pid, (unsigned long long)__entry->now) ); -#endif /* _TRACE_TIMER_H */ +#endif /* _TRACE_EVENTS_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/include/trace/fault.h b/include/trace/fault.h new file mode 100644 index 000000000000..3277e303fc43 --- /dev/null +++ b/include/trace/fault.h @@ -0,0 +1,25 @@ +#ifndef _TRACE_FAULT_H +#define _TRACE_FAULT_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(page_fault_entry, + TP_PROTO(struct pt_regs 
*regs, int trapnr, + struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access), + TP_ARGS(regs, trapnr, mm, vma, address, write_access)); +DECLARE_TRACE(page_fault_exit, + TP_PROTO(int res), + TP_ARGS(res)); +DECLARE_TRACE(page_fault_nosem_entry, + TP_PROTO(struct pt_regs *regs, int trapnr, unsigned long address), + TP_ARGS(regs, trapnr, address)); +DECLARE_TRACE_NOARGS(page_fault_nosem_exit); +DECLARE_TRACE(page_fault_get_user_entry, + TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access), + TP_ARGS(mm, vma, address, write_access)); +DECLARE_TRACE(page_fault_get_user_exit, + TP_PROTO(int res), + TP_ARGS(res)); +#endif diff --git a/include/trace/filemap.h b/include/trace/filemap.h new file mode 100644 index 000000000000..14e90ba9a092 --- /dev/null +++ b/include/trace/filemap.h @@ -0,0 +1,19 @@ +#ifndef _TRACE_FILEMAP_H +#define _TRACE_FILEMAP_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(wait_on_page_start, + TP_PROTO(struct page *page, int bit_nr), + TP_ARGS(page, bit_nr)); +DECLARE_TRACE(wait_on_page_end, + TP_PROTO(struct page *page, int bit_nr), + TP_ARGS(page, bit_nr)); +DECLARE_TRACE(add_to_page_cache, + TP_PROTO(struct address_space *mapping, pgoff_t offset), + TP_ARGS(mapping, offset)); +DECLARE_TRACE(remove_from_page_cache, + TP_PROTO(struct address_space *mapping), + TP_ARGS(mapping)); + +#endif diff --git a/include/trace/fs.h b/include/trace/fs.h new file mode 100644 index 000000000000..efe7e477dc12 --- /dev/null +++ b/include/trace/fs.h @@ -0,0 +1,66 @@ +#ifndef _TRACE_FS_H +#define _TRACE_FS_H + +#include <linux/buffer_head.h> +#include <linux/time.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(fs_buffer_wait_start, + TP_PROTO(struct buffer_head *bh), + TP_ARGS(bh)); +DECLARE_TRACE(fs_buffer_wait_end, + TP_PROTO(struct buffer_head *bh), + TP_ARGS(bh)); +DECLARE_TRACE(fs_exec, + TP_PROTO(const char *filename), + TP_ARGS(filename)); +DECLARE_TRACE(fs_ioctl, + 
TP_PROTO(unsigned int fd, unsigned int cmd, unsigned long arg), + TP_ARGS(fd, cmd, arg)); +DECLARE_TRACE(fs_open, + TP_PROTO(int fd, char *filename), + TP_ARGS(fd, filename)); +DECLARE_TRACE(fs_close, + TP_PROTO(unsigned int fd), + TP_ARGS(fd)); +DECLARE_TRACE(fs_lseek, + TP_PROTO(unsigned int fd, long offset, unsigned int origin), + TP_ARGS(fd, offset, origin)); +DECLARE_TRACE(fs_llseek, + TP_PROTO(unsigned int fd, loff_t offset, unsigned int origin), + TP_ARGS(fd, offset, origin)); + +/* + * Probes must be aware that __user * may be modified by concurrent userspace + * or kernel threads. + */ +DECLARE_TRACE(fs_read, + TP_PROTO(unsigned int fd, char __user *buf, size_t count, ssize_t ret), + TP_ARGS(fd, buf, count, ret)); +DECLARE_TRACE(fs_write, + TP_PROTO(unsigned int fd, const char __user *buf, size_t count, + ssize_t ret), + TP_ARGS(fd, buf, count, ret)); +DECLARE_TRACE(fs_pread64, + TP_PROTO(unsigned int fd, char __user *buf, size_t count, loff_t pos, + ssize_t ret), + TP_ARGS(fd, buf, count, pos, ret)); +DECLARE_TRACE(fs_pwrite64, + TP_PROTO(unsigned int fd, const char __user *buf, size_t count, + loff_t pos, ssize_t ret), + TP_ARGS(fd, buf, count, pos, ret)); +DECLARE_TRACE(fs_readv, + TP_PROTO(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, ssize_t ret), + TP_ARGS(fd, vec, vlen, ret)); +DECLARE_TRACE(fs_writev, + TP_PROTO(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, ssize_t ret), + TP_ARGS(fd, vec, vlen, ret)); +DECLARE_TRACE(fs_select, + TP_PROTO(int fd, struct timespec *end_time), + TP_ARGS(fd, end_time)); +DECLARE_TRACE(fs_poll, + TP_PROTO(int fd), + TP_ARGS(fd)); +#endif diff --git a/include/trace/hugetlb.h b/include/trace/hugetlb.h new file mode 100644 index 000000000000..c18944e34018 --- /dev/null +++ b/include/trace/hugetlb.h @@ -0,0 +1,28 @@ +#ifndef _TRACE_HUGETLB_H +#define _TRACE_HUGETLB_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(hugetlb_page_release, + TP_PROTO(struct page *page), + 
TP_ARGS(page)); +DECLARE_TRACE(hugetlb_page_grab, + TP_PROTO(struct page *page), + TP_ARGS(page)); +DECLARE_TRACE(hugetlb_buddy_pgalloc, + TP_PROTO(struct page *page), + TP_ARGS(page)); +DECLARE_TRACE(hugetlb_page_alloc, + TP_PROTO(struct page *page), + TP_ARGS(page)); +DECLARE_TRACE(hugetlb_page_free, + TP_PROTO(struct page *page), + TP_ARGS(page)); +DECLARE_TRACE(hugetlb_pages_reserve, + TP_PROTO(struct inode *inode, long from, long to, int ret), + TP_ARGS(inode, from, to, ret)); +DECLARE_TRACE(hugetlb_pages_unreserve, + TP_PROTO(struct inode *inode, long offset, long freed), + TP_ARGS(inode, offset, freed)); + +#endif diff --git a/include/trace/ipc.h b/include/trace/ipc.h new file mode 100644 index 000000000000..ea9dac190c3b --- /dev/null +++ b/include/trace/ipc.h @@ -0,0 +1,18 @@ +#ifndef _TRACE_IPC_H +#define _TRACE_IPC_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(ipc_msg_create, + TP_PROTO(long id, int flags), + TP_ARGS(id, flags)); +DECLARE_TRACE(ipc_sem_create, + TP_PROTO(long id, int flags), + TP_ARGS(id, flags)); +DECLARE_TRACE(ipc_shm_create, + TP_PROTO(long id, int flags), + TP_ARGS(id, flags)); +DECLARE_TRACE(ipc_call, + TP_PROTO(unsigned int call, unsigned int first), + TP_ARGS(call, first)); +#endif diff --git a/include/trace/ipv4.h b/include/trace/ipv4.h new file mode 100644 index 000000000000..388908a788ea --- /dev/null +++ b/include/trace/ipv4.h @@ -0,0 +1,14 @@ +#ifndef _TRACE_IPV4_H +#define _TRACE_IPV4_H + +#include <linux/inetdevice.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(ipv4_addr_add, + TP_PROTO(struct in_ifaddr *ifa), + TP_ARGS(ifa)); +DECLARE_TRACE(ipv4_addr_del, + TP_PROTO(struct in_ifaddr *ifa), + TP_ARGS(ifa)); + +#endif diff --git a/include/trace/ipv6.h b/include/trace/ipv6.h new file mode 100644 index 000000000000..ffb9b113048b --- /dev/null +++ b/include/trace/ipv6.h @@ -0,0 +1,14 @@ +#ifndef _TRACE_IPV6_H +#define _TRACE_IPV6_H + +#include <net/if_inet6.h> +#include <linux/tracepoint.h> + 
+DECLARE_TRACE(ipv6_addr_add, + TP_PROTO(struct inet6_ifaddr *ifa), + TP_ARGS(ifa)); +DECLARE_TRACE(ipv6_addr_del, + TP_PROTO(struct inet6_ifaddr *ifa), + TP_ARGS(ifa)); + +#endif diff --git a/include/trace/irq.h b/include/trace/irq.h new file mode 100644 index 000000000000..58d506369365 --- /dev/null +++ b/include/trace/irq.h @@ -0,0 +1,31 @@ +#ifndef _LTTNG_TRACE_IRQ_H +#define _LTTNG_TRACE_IRQ_H + +#include <linux/kdebug.h> +#include <linux/interrupt.h> + +/* + * action can be NULL if not available. + */ +DECLARE_TRACE(irq_entry, + TP_PROTO(unsigned int id, struct pt_regs *regs, + struct irqaction *action), + TP_ARGS(id, regs, action)); +DECLARE_TRACE(irq_exit, + TP_PROTO(irqreturn_t retval), + TP_ARGS(retval)); + +DECLARE_TRACE(irq_tasklet_low_entry, + TP_PROTO(struct tasklet_struct *t), + TP_ARGS(t)); +DECLARE_TRACE(irq_tasklet_low_exit, + TP_PROTO(struct tasklet_struct *t), + TP_ARGS(t)); +DECLARE_TRACE(irq_tasklet_high_entry, + TP_PROTO(struct tasklet_struct *t), + TP_ARGS(t)); +DECLARE_TRACE(irq_tasklet_high_exit, + TP_PROTO(struct tasklet_struct *t), + TP_ARGS(t)); + +#endif diff --git a/include/trace/kernel.h b/include/trace/kernel.h new file mode 100644 index 000000000000..ca61c54525b9 --- /dev/null +++ b/include/trace/kernel.h @@ -0,0 +1,31 @@ +#ifndef _TRACE_KERNEL_H +#define _TRACE_KERNEL_H + +#include <linux/tracepoint.h> +#include <linux/kexec.h> + +struct kimage; + +DECLARE_TRACE(kernel_printk, + TP_PROTO(unsigned long retaddr), + TP_ARGS(retaddr)); +DECLARE_TRACE(kernel_vprintk, + TP_PROTO(unsigned long retaddr, char *buf, int len), + TP_ARGS(retaddr, buf, len)); +DECLARE_TRACE(kernel_module_free, + TP_PROTO(struct module *mod), + TP_ARGS(mod)); +DECLARE_TRACE(kernel_module_load, + TP_PROTO(struct module *mod), + TP_ARGS(mod)); +DECLARE_TRACE(kernel_panic, + TP_PROTO(const char *fmt, va_list args), + TP_ARGS(fmt, args)); +DECLARE_TRACE(kernel_kernel_kexec, + TP_PROTO(struct kimage *image), + TP_ARGS(image)); +DECLARE_TRACE(kernel_crash_kexec, + 
TP_PROTO(struct kimage *image, struct pt_regs *regs), + TP_ARGS(image, regs)); + +#endif diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h new file mode 100644 index 000000000000..dbd4629d0f8d --- /dev/null +++ b/include/trace/lockdep.h @@ -0,0 +1,37 @@ +#ifndef _LTTNG_TRACE_LOCKDEP_H +#define _LTTNG_TRACE_LOCKDEP_H + +#include <linux/lockdep.h> +#include <linux/tracepoint.h> + +/* + * lockdep tracing must be very careful with respect to reentrancy. + * + * It should not use immediate values for activation because it involves + * traps called when the code patching is done. + */ +DECLARE_TRACE(lockdep_hardirqs_on, + TP_PROTO(unsigned long retaddr), + TP_ARGS(retaddr)); +DECLARE_TRACE(lockdep_hardirqs_off, + TP_PROTO(unsigned long retaddr), + TP_ARGS(retaddr)); +DECLARE_TRACE(lockdep_softirqs_on, + TP_PROTO(unsigned long retaddr), + TP_ARGS(retaddr)); +DECLARE_TRACE(lockdep_softirqs_off, + TP_PROTO(unsigned long retaddr), + TP_ARGS(retaddr)); + +/* FIXME : some duplication with lockdep TRACE EVENTs */ +DECLARE_TRACE(lockdep_lock_acquire, + TP_PROTO(unsigned long retaddr, unsigned int subclass, + struct lockdep_map *lock, int trylock, int read, + int hardirqs_off), + TP_ARGS(retaddr, subclass, lock, trylock, read, hardirqs_off)); +DECLARE_TRACE(lockdep_lock_release, + TP_PROTO(unsigned long retaddr, struct lockdep_map *lock, int nested), + TP_ARGS(retaddr, lock, nested)); + + +#endif /* _LTTNG_TRACE_LOCKDEP_H */ diff --git a/include/trace/net.h b/include/trace/net.h new file mode 100644 index 000000000000..91a0f02d0bc4 --- /dev/null +++ b/include/trace/net.h @@ -0,0 +1,40 @@ +#ifndef _TRACE_LTTNG_NET_H +#define _TRACE_LTTNG_NET_H + +#include <linux/tracepoint.h> + +struct sk_buff; +DECLARE_TRACE(lttng_net_dev_xmit, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); +DECLARE_TRACE(lttng_net_dev_receive, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); +DECLARE_TRACE(net_tcpv4_rcv, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); 
+DECLARE_TRACE(net_udpv4_rcv, + TP_PROTO(struct sk_buff *skb), + TP_ARGS(skb)); + +/* + * Note these first 2 traces are actually in __napi_schedule and net_rx_action + * respectively. The former is in __napi_schedule because it uses at-most-once + * logic and placing it in the calling routine (napi_schedule) would produce + * countless trace events that were effectively no-ops. napi_poll is + * implemented in net_rx_action, because thats where we do our polling on + * devices. The last trace point is in napi_complete, right where you would + * think it would be. + */ +struct napi_struct; +DECLARE_TRACE(net_napi_schedule, + TP_PROTO(struct napi_struct *n), + TP_ARGS(n)); +DECLARE_TRACE(net_napi_poll, + TP_PROTO(struct napi_struct *n), + TP_ARGS(n)); +DECLARE_TRACE(net_napi_complete, + TP_PROTO(struct napi_struct *n), + TP_ARGS(n)); + +#endif diff --git a/include/trace/page_alloc.h b/include/trace/page_alloc.h new file mode 100644 index 000000000000..c30a389ea917 --- /dev/null +++ b/include/trace/page_alloc.h @@ -0,0 +1,16 @@ +#ifndef _TRACE_PAGE_ALLOC_H +#define _TRACE_PAGE_ALLOC_H + +#include <linux/tracepoint.h> + +/* + * mm_page_alloc : page can be NULL. 
+ */ +DECLARE_TRACE(page_alloc, + TP_PROTO(struct page *page, unsigned int order), + TP_ARGS(page, order)); +DECLARE_TRACE(page_free, + TP_PROTO(struct page *page, unsigned int order), + TP_ARGS(page, order)); + +#endif diff --git a/include/trace/pm.h b/include/trace/pm.h new file mode 100644 index 000000000000..84078bbe1dba --- /dev/null +++ b/include/trace/pm.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_PM_H +#define _TRACE_PM_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE_NOARGS(pm_idle_entry); +DECLARE_TRACE_NOARGS(pm_idle_exit); +DECLARE_TRACE_NOARGS(pm_suspend_entry); +DECLARE_TRACE_NOARGS(pm_suspend_exit); + +#endif diff --git a/include/trace/rcu.h b/include/trace/rcu.h new file mode 100644 index 000000000000..f551c2ca9a07 --- /dev/null +++ b/include/trace/rcu.h @@ -0,0 +1,43 @@ +#ifndef _TRACE_RCU_H +#define _TRACE_RCU_H + +#include <linux/tracepoint.h> +#include <linux/rcupdate.h> + +DECLARE_TRACE(rcu_classic_callback, + TP_PROTO(struct rcu_head *head), + TP_ARGS(head)); + +DECLARE_TRACE(rcu_classic_call_rcu, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +DECLARE_TRACE(rcu_classic_call_rcu_bh, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +DECLARE_TRACE(rcu_preempt_callback, + TP_PROTO(struct rcu_head *head), + TP_ARGS(head)); + +DECLARE_TRACE(rcu_preempt_call_rcu, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +DECLARE_TRACE(rcu_preempt_call_rcu_sched, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +DECLARE_TRACE(rcu_tree_callback, + TP_PROTO(struct rcu_head *head), + TP_ARGS(head)); + +DECLARE_TRACE(rcu_tree_call_rcu, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +DECLARE_TRACE(rcu_tree_call_rcu_bh, + TP_PROTO(struct rcu_head *head, unsigned long ip), + TP_ARGS(head, ip)); + +#endif diff --git a/include/trace/sched.h b/include/trace/sched.h new file mode 100644 index 000000000000..a4b0307c4d6a --- /dev/null 
+++ b/include/trace/sched.h @@ -0,0 +1,11 @@ +#ifndef _LTTNG_TRACE_SCHED_H +#define _LTTNG_TRACE_SCHED_H + +#include <linux/sched.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(sched_kthread_create, + TP_PROTO(void *fn, int pid), + TP_ARGS(fn, pid)); + +#endif /* _LTTNG_TRACE_SCHED_H */ diff --git a/include/trace/socket.h b/include/trace/socket.h new file mode 100644 index 000000000000..4e8a324575db --- /dev/null +++ b/include/trace/socket.h @@ -0,0 +1,77 @@ +#ifndef _TRACE_SOCKET_H +#define _TRACE_SOCKET_H + +#include <net/sock.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(socket_create, + TP_PROTO(int family, int type, int protocol, struct socket *sock, + int ret), + TP_ARGS(family, type, protocol, sock, ret)); + +DECLARE_TRACE(socket_bind, + TP_PROTO(int fd, struct sockaddr __user *umyaddr, int addrlen, int ret), + TP_ARGS(fd, umyaddr, addrlen, ret)); + +DECLARE_TRACE(socket_connect, + TP_PROTO(int fd, struct sockaddr __user *uservaddr, int addrlen, + int ret), + TP_ARGS(fd, uservaddr, addrlen, ret)); + +DECLARE_TRACE(socket_listen, + TP_PROTO(int fd, int backlog, int ret), + TP_ARGS(fd, backlog, ret)); + +DECLARE_TRACE(socket_accept, + TP_PROTO(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags, int ret), + TP_ARGS(fd, upeer_sockaddr, upeer_addrlen, flags, ret)); + +DECLARE_TRACE(socket_getsockname, + TP_PROTO(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len, int ret), + TP_ARGS(fd, usockaddr, usockaddr_len, ret)); + +DECLARE_TRACE(socket_getpeername, + TP_PROTO(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len, int ret), + TP_ARGS(fd, usockaddr, usockaddr_len, ret)); + +DECLARE_TRACE(socket_socketpair, + TP_PROTO(int family, int type, int protocol, int __user *usockvec, + int ret), + TP_ARGS(family, type, protocol, usockvec, ret)); + +DECLARE_TRACE(socket_sendmsg, + TP_PROTO(struct socket *sock, struct msghdr *msg, size_t size, int ret), + TP_ARGS(sock, msg, size, ret)); + 
+DECLARE_TRACE(socket_recvmsg, + TP_PROTO(struct socket *sock, struct msghdr *msg, size_t size, + int flags, int ret), + TP_ARGS(sock, msg, size, flags, ret)); + +DECLARE_TRACE(socket_setsockopt, + TP_PROTO(int fd, int level, int optname, char __user *optval, + int optlen, int ret), + TP_ARGS(fd, level, optname, optval, optlen, ret)); + +DECLARE_TRACE(socket_getsockopt, + TP_PROTO(int fd, int level, int optname, char __user *optval, + int __user *optlen, int ret), + TP_ARGS(fd, level, optname, optval, optlen, ret)); + +DECLARE_TRACE(socket_shutdown, + TP_PROTO(int fd, int how, int ret), + TP_ARGS(fd, how, ret)); + +/* + * socket_call + * + * We also trace socket_call so we can know which syscall is used by user + * (socket_call or sock_send...) + */ +DECLARE_TRACE(socket_call, + TP_PROTO(int call, unsigned long a0), + TP_ARGS(call, a0)); +#endif diff --git a/include/trace/swap.h b/include/trace/swap.h new file mode 100644 index 000000000000..bd035a7204e5 --- /dev/null +++ b/include/trace/swap.h @@ -0,0 +1,20 @@ +#ifndef _TRACE_SWAP_H +#define _TRACE_SWAP_H + +#include <linux/swap.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(swap_in, + TP_PROTO(struct page *page, swp_entry_t entry), + TP_ARGS(page, entry)); +DECLARE_TRACE(swap_out, + TP_PROTO(struct page *page), + TP_ARGS(page)); +DECLARE_TRACE(swap_file_open, + TP_PROTO(struct file *file, char *filename), + TP_ARGS(file, filename)); +DECLARE_TRACE(swap_file_close, + TP_PROTO(struct file *file), + TP_ARGS(file)); + +#endif diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 31966a4fb8cc..2f40e5e65a28 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -6,6 +6,7 @@ #include <linux/ftrace_event.h> #include <asm/ptrace.h> +#include <linux/tracepoint.h> /* @@ -54,4 +55,11 @@ int perf_sysexit_enable(struct ftrace_event_call *call); void perf_sysexit_disable(struct ftrace_event_call *call); #endif +DECLARE_TRACE(syscall_entry, + TP_PROTO(struct pt_regs *regs, long id), + 
TP_ARGS(regs, id)); +DECLARE_TRACE(syscall_exit, + TP_PROTO(long ret), + TP_ARGS(ret)); + #endif /* _TRACE_SYSCALL_H */ diff --git a/include/trace/timer.h b/include/trace/timer.h new file mode 100644 index 000000000000..9b2a852ca218 --- /dev/null +++ b/include/trace/timer.h @@ -0,0 +1,24 @@ +#ifndef _TRACE_TIMER_H +#define _TRACE_TIMER_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(timer_itimer_expired, + TP_PROTO(struct signal_struct *sig), + TP_ARGS(sig)); +DECLARE_TRACE(timer_itimer_set, + TP_PROTO(int which, struct itimerval *value), + TP_ARGS(which, value)); +DECLARE_TRACE(timer_set, + TP_PROTO(struct timer_list *timer), + TP_ARGS(timer)); +/* + * xtime_lock is taken when kernel_timer_update_time tracepoint is reached. + */ +DECLARE_TRACE(timer_update_time, + TP_PROTO(struct timespec *_xtime, struct timespec *_wall_to_monotonic), + TP_ARGS(_xtime, _wall_to_monotonic)); +DECLARE_TRACE(timer_timeout, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); +#endif diff --git a/include/trace/trap.h b/include/trace/trap.h new file mode 100644 index 000000000000..1b70c0499606 --- /dev/null +++ b/include/trace/trap.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_TRAP_H +#define _TRACE_TRAP_H + +#include <linux/tracepoint.h> + +DECLARE_TRACE(trap_entry, + TP_PROTO(struct pt_regs *regs, long id), + TP_ARGS(regs, id)); +DECLARE_TRACE_NOARGS(trap_exit); + +#endif diff --git a/init/Kconfig b/init/Kconfig index be788c0957d4..64da1a37121c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -797,6 +797,41 @@ config NET_NS endif # NAMESPACES +# +# Architectures with a 64-bits get_cycles() should select this. +# They should also define +# get_cycles_barrier() : instruction synchronization barrier if required +# get_cycles_rate() : cycle counter rate, in HZ. If 0, TSC are not synchronized +# across CPUs or their frequency may vary due to frequency scaling. +# +config HAVE_GET_CYCLES + def_bool n + +# +# Architectures with a specialized tracing clock should select this. 
+# +config HAVE_TRACE_CLOCK + def_bool n + +config HAVE_TRACE_CLOCK_GENERIC + bool + default y if (!HAVE_TRACE_CLOCK) + default n if HAVE_TRACE_CLOCK + select HAVE_TRACE_CLOCK_32_TO_64 if (!64BIT) + +# +# Architectures with only a 32-bits clock source should select this. +# +config HAVE_TRACE_CLOCK_32_TO_64 + def_bool n + +# +# Architectures which need to dynamically detect if their TSC is unsynchronized +# across cpus should select this. +# +config HAVE_UNSYNCHRONIZED_TSC + def_bool n + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select EVENTFD @@ -1263,8 +1298,18 @@ config PROFILING config TRACEPOINTS bool +config MARKERS + bool "Activate markers" + select TRACEPOINTS + help + Place an empty function call at each marker site. Can be + dynamically changed for a probe function. + source "arch/Kconfig" +config HAVE_LTT_DUMP_TABLES + def_bool n + endmenu # General setup config HAVE_GENERIC_DMA_COHERENT diff --git a/ipc/msg.c b/ipc/msg.c index 747b65507a91..6a0500c7b478 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -37,6 +37,7 @@ #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> +#include <trace/ipc.h> #include <asm/current.h> #include <asm/uaccess.h> @@ -71,6 +72,8 @@ struct msg_sender { #define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) +DEFINE_TRACE(ipc_msg_create); + static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -314,6 +317,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) struct ipc_namespace *ns; struct ipc_ops msg_ops; struct ipc_params msg_params; + long ret; ns = current->nsproxy->ipc_ns; @@ -324,7 +328,9 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) msg_params.key = key; msg_params.flg = msgflg; - return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); + ret = ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); + trace_ipc_msg_create(ret, msgflg); + return ret; } static inline unsigned long 
diff --git a/ipc/sem.c b/ipc/sem.c index 0e0d49bbb867..32026ae7aa1d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -86,6 +86,7 @@ #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> +#include <trace/ipc.h> #include <asm/uaccess.h> #include "util.h" @@ -118,6 +119,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] +DEFINE_TRACE(ipc_sem_create); + void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; @@ -323,6 +326,7 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) struct ipc_namespace *ns; struct ipc_ops sem_ops; struct ipc_params sem_params; + long err; ns = current->nsproxy->ipc_ns; @@ -337,7 +341,9 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) sem_params.flg = semflg; sem_params.u.nsems = nsems; - return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); + err = ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); + trace_ipc_sem_create(err, semflg); + return err; } /* diff --git a/ipc/shm.c b/ipc/shm.c index 7d3bb22a9302..f9761bb2d116 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,6 +39,7 @@ #include <linux/nsproxy.h> #include <linux/mount.h> #include <linux/ipc_namespace.h> +#include <trace/ipc.h> #include <asm/uaccess.h> @@ -56,6 +57,8 @@ struct shm_file_data { static const struct file_operations shm_file_operations; static const struct vm_operations_struct shm_vm_ops; +DEFINE_TRACE(ipc_shm_create); + #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) #define shm_unlock(shp) \ @@ -456,6 +459,7 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) struct ipc_namespace *ns; struct ipc_ops shm_ops; struct ipc_params shm_params; + long err; ns = current->nsproxy->ipc_ns; @@ -467,7 +471,9 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) shm_params.flg = shmflg; shm_params.u.size = size; - return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + err = ipcget(ns, &shm_ids(ns), &shm_ops, 
&shm_params); + trace_ipc_shm_create(err, shmflg); + return err; } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) diff --git a/ipc/syscall.c b/ipc/syscall.c index 1d6f53f6b562..daf35cd9e4ee 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -12,6 +12,9 @@ #include <linux/shm.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <trace/ipc.h> + +DEFINE_TRACE(ipc_call); SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, unsigned long, third, void __user *, ptr, long, fifth) @@ -21,6 +24,8 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMOP: return sys_semtimedop(first, (struct sembuf __user *)ptr, diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba33..c039580ba3b6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o +obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_BINFMT_ELF) += elfcore.o @@ -99,7 +100,10 @@ obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_MARKERS) += ltt-channels.o obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace/ +obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..0d9a3444614b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -514,6 +514,8 @@ struct files_struct *get_files_struct(struct task_struct *task) return 
files; } +EXPORT_SYMBOL(get_files_struct); + void put_files_struct(struct files_struct *files) { struct fdtable *fdt; @@ -535,6 +537,8 @@ void put_files_struct(struct files_struct *files) } } +EXPORT_SYMBOL(put_files_struct); + void reset_files_struct(struct files_struct *files) { struct task_struct *tsk = current; diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..5bb0bb184349 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) @@ -1250,6 +1251,15 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* + * The state of the parent's TIF_KTRACE flag may have changed + * since it was copied in dup_task_struct() so we re-copy it here. 
+ */ + if (test_thread_flag(TIF_KERNEL_TRACE)) + set_tsk_thread_flag(p, TIF_KERNEL_TRACE); + else + clear_tsk_thread_flag(p, TIF_KERNEL_TRACE); + /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..db864334a952 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include <linux/kernel_stat.h> #include <trace/events/irq.h> +#include <trace/irq.h> #include "internals.h" @@ -51,6 +52,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +DEFINE_TRACE(irq_entry); +DEFINE_TRACE(irq_exit); + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -63,6 +67,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; + trace_irq_entry(irq, NULL, action); + do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); @@ -116,5 +122,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) add_interrupt_randomness(irq); local_irq_disable(); + trace_irq_exit(retval); + return retval; } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2039bea31bdf..1c07afd307fa 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -109,6 +109,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } +EXPORT_SYMBOL_GPL(irq_to_desc); static void delete_irq_desc(unsigned int irq) { @@ -273,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; } +EXPORT_SYMBOL_GPL(irq_to_desc); struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153da..18fd8e919c0a 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -13,9 +13,13 @@ #include <linux/posix-timers.h> #include <linux/hrtimer.h> #include <trace/events/timer.h> +#include <trace/timer.h> #include <asm/uaccess.h> +DEFINE_TRACE(timer_itimer_expired); +DEFINE_TRACE(timer_itimer_set); + /** * itimer_get_remtime - get remaining time for the timer * @@ -124,6 +128,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) container_of(timer, struct signal_struct, real_timer); trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); + trace_timer_itimer_expired(sig); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; @@ -201,6 +206,8 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) !timeval_valid(&value->it_interval)) return -EINVAL; + trace_timer_itimer_set(which, value); + switch (which) { case ITIMER_REAL: again: diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..779f0031929e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/kmsg_dump.h> +#include <trace/kernel.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -40,6 +41,9 @@ #include <asm/system.h> #include <asm/sections.h> +DEFINE_TRACE(kernel_kernel_kexec); +DEFINE_TRACE(kernel_crash_kexec); + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -1066,6 +1070,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { + trace_kernel_crash_kexec(kexec_crash_image, regs); + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. 
@@ -1495,6 +1501,8 @@ int kernel_kexec(void) { int error = 0; + trace_kernel_kernel_kexec(kexec_image); + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; if (!kexec_image) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..e0841c537dbb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,6 +49,8 @@ #include "lockdep_internals.h" +#include <trace/lockdep.h> + #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -66,6 +68,13 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_TRACE(lockdep_hardirqs_on); +DEFINE_TRACE(lockdep_hardirqs_off); +DEFINE_TRACE(lockdep_softirqs_on); +DEFINE_TRACE(lockdep_softirqs_off); +DEFINE_TRACE(lockdep_lock_acquire); +DEFINE_TRACE(lockdep_lock_release); + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -2300,6 +2309,8 @@ void trace_hardirqs_on_caller(unsigned long ip) time_hardirqs_on(CALLER_ADDR0, ip); + trace_lockdep_hardirqs_on(ip); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2358,6 +2369,8 @@ void trace_hardirqs_off_caller(unsigned long ip) time_hardirqs_off(CALLER_ADDR0, ip); + trace_lockdep_hardirqs_off(ip); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2390,6 +2403,8 @@ void trace_softirqs_on(unsigned long ip) { struct task_struct *curr = current; + trace_lockdep_softirqs_on(ip); + if (unlikely(!debug_locks)) return; @@ -2424,6 +2439,8 @@ void trace_softirqs_off(unsigned long ip) { struct task_struct *curr = current; + trace_lockdep_softirqs_off(ip); + if (unlikely(!debug_locks)) return; @@ -2730,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; + trace_lockdep_lock_acquire(ip, subclass, lock, trylock, read, + hardirqs_off); + if (!prove_locking) check = 1; @@ -3108,6 +3128,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; + 
trace_lockdep_lock_release(ip, lock, nested); + if (!check_unlock(curr, lock, ip)) return; diff --git a/kernel/ltt-channels.c b/kernel/ltt-channels.c new file mode 100644 index 000000000000..102513874ad1 --- /dev/null +++ b/kernel/ltt-channels.c @@ -0,0 +1,388 @@ +/* + * ltt/ltt-channels.c + * + * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * LTTng channel management. + * + * Author: + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include <linux/module.h> +#include <linux/ltt-channels.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/* + * ltt_channel_mutex may be nested inside the LTT trace mutex. + * ltt_channel_mutex mutex may be nested inside markers mutex. + */ +static DEFINE_MUTEX(ltt_channel_mutex); +static LIST_HEAD(ltt_channels); +/* + * Index of next channel in array. Makes sure that as long as a trace channel is + * allocated, no array index will be re-used when a channel is freed and then + * another channel is allocated. This index is cleared and the array indexeds + * get reassigned when the index_kref goes back to 0, which indicates that no + * more trace channels are allocated. + */ +static unsigned int free_index; +/* index_kref is protected by both ltt_channel_mutex and lock_markers */ +static struct kref index_kref; /* Keeps track of allocated trace channels */ + +static struct ltt_channel_setting *lookup_channel(const char *name) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (strcmp(name, iter->name) == 0) + return iter; + return NULL; +} + +/* + * Must be called when channel refcount falls to 0 _and_ also when the last + * trace is freed. This function is responsible for compacting the channel and + * event IDs when no users are active. + * + * Called with lock_markers() and channels mutex held. 
+ */ +static void release_channel_setting(struct kref *kref) +{ + struct ltt_channel_setting *setting = container_of(kref, + struct ltt_channel_setting, kref); + struct ltt_channel_setting *iter; + + if (atomic_read(&index_kref.refcount) == 0 + && atomic_read(&setting->kref.refcount) == 0) { + list_del(&setting->list); + kfree(setting); + + free_index = 0; + list_for_each_entry(iter, <t_channels, list) { + iter->index = free_index++; + iter->free_event_id = 0; + } + } +} + +/* + * Perform channel index compaction when the last trace channel is freed. + * + * Called with lock_markers() and channels mutex held. + */ +static void release_trace_channel(struct kref *kref) +{ + struct ltt_channel_setting *iter, *n; + + list_for_each_entry_safe(iter, n, <t_channels, list) + release_channel_setting(&iter->kref); + if (atomic_read(&index_kref.refcount) == 0) + markers_compact_event_ids(); +} + +/* + * ltt_channel_trace_ref : Is there an existing trace session ? + * + * Must be called with lock_markers() held. + */ +int ltt_channels_trace_ref(void) +{ + return !!atomic_read(&index_kref.refcount); +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_ref); + +/** + * ltt_channels_register - Register a trace channel. + * @name: channel name + * + * Uses refcounting. 
+ */ +int ltt_channels_register(const char *name) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (setting) { + if (atomic_read(&setting->kref.refcount) == 0) + goto init_kref; + else { + kref_get(&setting->kref); + goto end; + } + } + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) { + ret = -ENOMEM; + goto end; + } + list_add(&setting->list, <t_channels); + strncpy(setting->name, name, PATH_MAX-1); + setting->index = free_index++; +init_kref: + kref_init(&setting->kref); +end: + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_register); + +/** + * ltt_channels_unregister - Unregister a trace channel. + * @name: channel name + * @compacting: performing compaction + * + * Must be called with markers mutex held. + */ +int ltt_channels_unregister(const char *name, int compacting) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + if (!compacting) + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (!setting || atomic_read(&setting->kref.refcount) == 0) { + ret = -ENOENT; + goto end; + } + kref_put(&setting->kref, release_channel_setting); + if (!compacting && atomic_read(&index_kref.refcount) == 0) + markers_compact_event_ids(); +end: + if (!compacting) + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_unregister); + +/** + * ltt_channels_set_default - Set channel default behavior. 
+ * @name: default channel name + * @sb_size: size of the subbuffers + * @n_sb: number of subbuffers + */ +int ltt_channels_set_default(const char *name, + unsigned int sb_size, + unsigned int n_sb) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (!setting || atomic_read(&setting->kref.refcount) == 0) { + ret = -ENOENT; + goto end; + } + setting->sb_size = sb_size; + setting->n_sb = n_sb; +end: + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_set_default); + +/** + * ltt_channels_get_name_from_index - get channel name from channel index + * @index: channel index + * + * Allows to lookup the channel name given its index. Done to keep the name + * information outside of each trace channel instance. + */ +const char *ltt_channels_get_name_from_index(unsigned int index) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (iter->index == index && atomic_read(&iter->kref.refcount)) + return iter->name; + return NULL; +} +EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index); + +static struct ltt_channel_setting * +ltt_channels_get_setting_from_name(const char *name) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (!strcmp(iter->name, name) + && atomic_read(&iter->kref.refcount)) + return iter; + return NULL; +} + +/** + * ltt_channels_get_index_from_name - get channel index from channel name + * @name: channel name + * + * Allows to lookup the channel index given its name. Done to keep the name + * information outside of each trace channel instance. + * Returns -1 if not found. 
+ */ +int ltt_channels_get_index_from_name(const char *name) +{ + struct ltt_channel_setting *setting; + + setting = ltt_channels_get_setting_from_name(name); + if (setting) + return setting->index; + else + return -1; +} +EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name); + +/** + * ltt_channels_trace_alloc - Allocate channel structures for a trace + * @sb_size: subbuffer size. 0 uses default. + * @n_sb: number of subbuffers per per-cpu buffers. 0 uses default. + * @flags: Default channel flags + * + * Use the current channel list to allocate the channels for a trace. + * Called with trace lock held. Does not perform the trace buffer allocation, + * because we must let the user overwrite specific channel sizes. + */ +struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels, + int overwrite, int active) +{ + struct ltt_chan *chan = NULL; + struct ltt_channel_setting *iter; + + lock_markers(); + mutex_lock(<t_channel_mutex); + if (!free_index) + goto end; + if (!atomic_read(&index_kref.refcount)) + kref_init(&index_kref); + else + kref_get(&index_kref); + *nr_channels = free_index; + chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL); + if (!chan) + goto end; + list_for_each_entry(iter, <t_channels, list) { + if (!atomic_read(&iter->kref.refcount)) + continue; + chan[iter->index].a.sb_size = iter->sb_size; + chan[iter->index].a.n_sb = iter->n_sb; + chan[iter->index].overwrite = overwrite; + chan[iter->index].active = active; + strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1); + chan[iter->index].switch_timer_interval = 0; + } +end: + mutex_unlock(<t_channel_mutex); + unlock_markers(); + return chan; +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc); + +/** + * ltt_channels_trace_free - Free one trace's channels + * @channels: channels to free + * + * Called with trace lock held. The actual channel buffers must be freed before + * this function is called. 
+ */ +void ltt_channels_trace_free(struct ltt_chan *channels, + unsigned int nr_channels) +{ + lock_markers(); + mutex_lock(<t_channel_mutex); + kfree(channels); + kref_put(&index_kref, release_trace_channel); + mutex_unlock(<t_channel_mutex); + unlock_markers(); + marker_update_probes(); +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_free); + +/** + * ltt_channels_trace_set_timer - set switch timer + * @channel: channel + * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer. + */ + +void ltt_channels_trace_set_timer(struct ltt_chan *chan, + unsigned long interval) +{ + chan->switch_timer_interval = interval; +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer); + +/** + * _ltt_channels_get_event_id - get next event ID for a marker + * @channel: channel name + * @name: event name + * + * Returns a unique event ID (for this channel) or < 0 on error. + * Must be called with channels mutex held. + */ +int _ltt_channels_get_event_id(const char *channel, const char *name) +{ + struct ltt_channel_setting *setting; + int ret; + + setting = ltt_channels_get_setting_from_name(channel); + if (!setting) { + ret = -ENOENT; + goto end; + } + if (strcmp(channel, "metadata") == 0) { + if (strcmp(name, "core_marker_id") == 0) + ret = 0; + else if (strcmp(name, "core_marker_format") == 0) + ret = 1; + else + ret = -ENOENT; + goto end; + } + if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) { + ret = -ENOSPC; + goto end; + } + ret = setting->free_event_id++; +end: + return ret; +} + +/** + * ltt_channels_get_event_id - get next event ID for a marker + * @channel: channel name + * @name: event name + * + * Returns a unique event ID (for this channel) or < 0 on error. 
+ */ +int ltt_channels_get_event_id(const char *channel, const char *name) +{ + int ret; + + mutex_lock(<t_channel_mutex); + ret = _ltt_channels_get_event_id(channel, name); + mutex_unlock(<t_channel_mutex); + return ret; +} + +/** + * ltt_channels_reset_event_ids - reset event IDs at compaction + * + * Called with lock marker and channel mutex held. + */ +void _ltt_channels_reset_event_ids(void) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + iter->free_event_id = 0; +} + +MODULE_LICENSE("GPL and additional rights"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management"); diff --git a/kernel/marker.c b/kernel/marker.c new file mode 100644 index 000000000000..eac8ebfc3b9e --- /dev/null +++ b/kernel/marker.c @@ -0,0 +1,1262 @@ +/* + * Copyright (C) 2007 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/marker.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/immediate.h> +#include <linux/ltt-channels.h> + +extern struct marker __start___markers[]; +extern struct marker __stop___markers[]; + +/* Set to 1 to enable marker debug output */ +static const int marker_debug; + +/* + * markers_mutex nests inside module_mutex. Markers mutex protects the builtin + * and module markers and the hash table. + * markers_mutex nests inside the trace lock, to ensure event ID consistency + * between the hash table and the marker section. + */ +static DEFINE_MUTEX(markers_mutex); + +void lock_markers(void) +{ + mutex_lock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(lock_markers); + +void unlock_markers(void) +{ + mutex_unlock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(unlock_markers); + +/* + * Marker hash table, containing the active markers. + * Protected by module_mutex. + */ +#define MARKER_HASH_BITS 6 +#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; +static struct hlist_head id_table[MARKER_TABLE_SIZE]; + +struct marker_probe_array { + struct rcu_head rcu; + struct marker_probe_closure c[0]; +}; + +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. 
+ */ +struct marker_entry { + struct hlist_node hlist; + struct hlist_node id_list; + char *format; + char *name; + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_array *multi; + int refcount; /* Number of times armed. 0 if disarmed. */ + u16 channel_id; + u16 event_id; + unsigned char ptype:1; + unsigned char format_allocated:1; + char channel[0]; /* Contains channel'\0'name'\0'format'\0' */ +}; + +/** + * __mark_empty_function - Empty probe callback + * @mdata: marker data + * @probe_private: probe private data + * @call_private: call site private data + * @fmt: format string + * @...: variable argument list + * + * Empty callback provided as a probe to the markers. By providing this to a + * disabled marker, we make sure the execution flow is always valid even + * though the function pointer change and the marker enabling are two distinct + * operations that modifies the execution flow of preemptible code. + */ +notrace void __mark_empty_function(const struct marker *mdata, + void *probe_private, void *call_private, const char *fmt, va_list *args) +{ +} +EXPORT_SYMBOL_GPL(__mark_empty_function); + +/* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) +{ + va_list args; + char ptype; + + /* + * rcu_read_lock_sched does two things : disabling preemption to make + * sure the teardown of the callbacks can be done correctly when they + * are in modules and they insure RCU read coherency. 
+ */ + rcu_read_lock_sched_notrace(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, call_private); + func(mdata, mdata->single.probe_private, call_private, + mdata->format, &args); + va_end(args); + } else { + struct marker_probe_array *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi->c[i].func; i++) { + va_start(args, call_private); + multi->c[i].func(mdata, multi->c[i].probe_private, + call_private, mdata->format, &args); + va_end(args); + } + } + rcu_read_unlock_sched_notrace(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) +{ + va_list args; /* not initialized */ + char ptype; + + rcu_read_lock_sched_notrace(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. 
They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata, mdata->single.probe_private, call_private, + mdata->format, &args); + } else { + struct marker_probe_array *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi->c[i].func; i++) + multi->c[i].func(mdata, multi->c[i].probe_private, + call_private, mdata->format, &args); + } + rcu_read_unlock_sched_notrace(); +} + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_probe_array *multi = container_of(head, struct marker_probe_array, rcu); + kfree(multi); +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi->c[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi->c[i].func, + entry->multi->c[i].probe_private); + } +} + +static struct marker_probe_array * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_array *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + 
entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) + if (old->c[nr_probes].func == probe + && old->c[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc(sizeof(struct marker_probe_array) + + ((nr_probes + 2) * sizeof(struct marker_probe_closure)), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new->c[0] = entry->single; + else + memcpy(&new->c[0], &old->c[0], + nr_probes * sizeof(struct marker_probe_closure)); + new->c[nr_probes].func = probe; + new->c[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_array * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_array *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) { + if ((!probe || old->c[nr_probes].func == probe) + && old->c[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { 
+ /* N -> 1, (N > 1) */ + for (i = 0; old->c[i].func; i++) + if ((probe && old->c[i].func != probe) || + old->c[i].probe_private != probe_private) + entry->single = old->c[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc(sizeof(struct marker_probe_array) + + ((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure)), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old->c[i].func; i++) + if ((probe && old->c[i].func != probe) || + old->c[i].probe_private != probe_private) + new->c[j++] = old->c[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get marker if the marker is present in the marker hash table. + * Must be called with markers_mutex held. + * Returns NULL if not present. + */ +static struct marker_entry *get_marker(const char *channel, const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the marker to the marker hash table. Must be called with markers_mutex + * held. 
+ */ +static struct marker_entry *add_marker(const char *channel, const char *name, + const char *format) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + size_t format_len = 0; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + if (format) + format_len = strlen(format) + 1; + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + printk(KERN_NOTICE + "Marker %s.%s busy\n", channel, name); + return ERR_PTR(-EBUSY); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct marker_entry) + + channel_len + name_len + format_len, + GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(e->channel, channel, channel_len); + e->name = &e->channel[channel_len]; + memcpy(e->name, name, name_len); + if (format) { + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + trace_mark(metadata, core_marker_format, + "channel %s name %s format %s", + e->channel, e->name, e->format); + } else { + e->format = NULL; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; + e->format_allocated = 0; + e->refcount = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the marker from the marker hash table. Must be called with mutex_lock + * held. Parameter "registered" indicates if the channel registration has been + * performed. 
+ */ +static int remove_marker(const char *channel, const char *name, int registered, + int compacting) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + int found = 0; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + u32 hash; + int ret; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + + if (registered && ltt_channels_trace_ref()) + return 0; + + hlist_del(&e->hlist); + hlist_del(&e->id_list); + if (registered) { + ret = ltt_channels_unregister(e->channel, compacting); + WARN_ON(ret); + } + if (e->format_allocated) + kfree(e->format); + kfree(e); + return 0; +} + +/* + * Set the mark_entry format to the format found in the element. + */ +static int marker_set_format(struct marker_entry *entry, const char *format) +{ + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) + return -ENOMEM; + entry->format_allocated = 1; + + trace_mark(metadata, core_marker_format, + "channel %s name %s format %s", + entry->channel, entry->name, entry->format); + return 0; +} + +/* + * Sets the probe callback corresponding to one marker. + */ +static int set_marker(struct marker_entry *entry, struct marker *elem, + int active) +{ + int ret = 0; + WARN_ON(strcmp(entry->name, elem->name) != 0); + + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { + printk(KERN_NOTICE + "Format mismatch for probe %s " + "(%s), marker (%s)\n", + entry->name, + entry->format, + elem->format); + return -EPERM; + } + } else { + ret = marker_set_format(entry, elem->format); + if (ret) + return ret; + } + + /* + * probe_cb setup (statically known) is done here. 
It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = entry->call; + elem->channel_id = entry->channel_id; + elem->event_id = entry->event_id; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = entry->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, entry->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. + */ + smp_wmb(); + elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ _imv_read(elem->state))) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb, NULL); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb, NULL); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } + elem->state__imv = active; + + return ret; +} + +/* + * Disable a marker and its probe callback. 
+ * Note: only waiting an RCU period after setting elem->call to the empty + * function ensures that the original callback is not used anymore. This is + * ensured by rcu_read_lock_sched around the call site. + */ +static void disable_marker(struct marker *elem) +{ + int ret; + + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && _imv_read(elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb, NULL); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } + elem->state__imv = 0; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ + /* + * Leave the private data and channel_id/event_id there, because removal + * is racy and should be done only after an RCU period. These are never + * used until the next initialization anyway. + */ +} + +/* + * is_marker_present - Check if a marker is present in kernel. + * @channel: channel name + * @name: marker name + * + * We cannot take the marker lock around calls to this function because it needs + * to take the module mutex within the iterator. Marker mutex nests inside + * module mutex. + * Returns 1 if the marker is present, 0 if not. 
+ */ +int is_marker_present(const char *channel, const char *name) +{ + int ret; + struct marker_iter iter; + + ret = 0; + + marker_iter_reset(&iter); + marker_iter_start(&iter); + for (; iter.marker != NULL; marker_iter_next(&iter)) { + if (!strcmp(iter.marker->channel, channel) && + !strcmp(iter.marker->name, name)) { + ret = 1; + goto end; + } + } +end: + marker_iter_stop(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(is_marker_present); + +/* + * _is_marker_enabled - Check if a marker is enabled, must be called with + * markers_mutex held. + * @channel: channel name + * @name: marker name + * + * Returns 1 if the marker is enabled, 0 if disabled. + */ +int _is_marker_enabled(const char *channel, const char *name) +{ + struct marker_entry *entry; + + entry = get_marker(channel, name); + + return entry && !!entry->refcount; +} +EXPORT_SYMBOL_GPL(_is_marker_enabled); + +/* + * is_marker_enabled - the wrapper of _is_marker_enabled + * @channel: channel name + * @name: marker name + * + * Returns 1 if the marker is enabled, 0 if disabled. + */ +int is_marker_enabled(const char *channel, const char *name) +{ + int ret; + + lock_markers(); + ret = _is_marker_enabled(channel, name); + unlock_markers(); + + return ret; +} +EXPORT_SYMBOL_GPL(is_marker_enabled); + +/** + * marker_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of markers. + */ +void marker_update_probe_range(struct marker *begin, + struct marker *end) +{ + struct marker *iter; + struct marker_entry *mark_entry; + + mutex_lock(&markers_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_marker(iter->channel, iter->name); + if (mark_entry) { + set_marker(mark_entry, iter, !!mark_entry->refcount); + /* + * ignore error, continue + */ + } else { + disable_marker(iter); + } + } + mutex_unlock(&markers_mutex); +} + +/* + * Update probes, removing the faulty probes. 
+ * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Side effect : marker_set_format may delete the marker entry (creating a + * replacement). + */ +void marker_update_probes(void) +{ + /* Core kernel markers */ + marker_update_probe_range(__start___markers, __stop___markers); + /* Markers in modules. */ + module_update_markers(); + tracepoint_probe_update_all(); + /* Update immediate values */ + core_imv_update(); + module_imv_update(); +} + +/** + * marker_probe_register - Connect a probe to a marker + * @channel: marker channel + * @name: marker name + * @format: format string + * @probe: probe handler + * @probe_private: probe private data + * + * private data must be a valid allocated memory address, or NULL. + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. 
+ */ +int marker_probe_register(const char *channel, const char *name, + const char *format, marker_probe_func *probe, + void *probe_private) +{ + struct marker_entry *entry; + int ret = 0, ret_err; + struct marker_probe_array *old; + int first_probe = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(channel, name); + if (!entry) { + first_probe = 1; + entry = add_marker(channel, name, format); + if (IS_ERR(entry)) + ret = PTR_ERR(entry); + if (ret) + goto end; + ret = ltt_channels_register(channel); + if (ret) + goto error_remove_marker; + ret = ltt_channels_get_index_from_name(channel); + if (ret < 0) + goto error_unregister_channel; + entry->channel_id = ret; + ret = ltt_channels_get_event_id(channel, name); + if (ret < 0) + goto error_unregister_channel; + entry->event_id = ret; + hlist_add_head(&entry->id_list, id_table + hash_32( + (entry->channel_id << 16) | entry->event_id, + MARKER_HASH_BITS)); + ret = 0; + trace_mark(metadata, core_marker_id, + "channel %s name %s event_id %hu " + "int #1u%zu long #1u%zu pointer #1u%zu " + "size_t #1u%zu alignment #1u%u", + channel, name, entry->event_id, + sizeof(int), sizeof(long), sizeof(void *), + sizeof(size_t), ltt_get_alignment()); + } else if (format) { + if (!entry->format) + ret = marker_set_format(entry, format); + else if (strcmp(entry->format, format)) + ret = -EPERM; + if (ret) + goto end; + } + + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + if (first_probe) + goto error_unregister_channel; + else + goto end; + } + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + return ret; + +error_unregister_channel: + ret_err = ltt_channels_unregister(channel, 1); + WARN_ON(ret_err); +error_remove_marker: + ret_err = remove_marker(channel, name, 0, 0); + WARN_ON(ret_err); +end: + mutex_unlock(&markers_mutex); + marker_update_probes(); /* for compaction on error path */ + return ret; +} 
+EXPORT_SYMBOL_GPL(marker_probe_register); + +/** + * marker_probe_unregister - Disconnect a probe from a marker + * @channel: marker channel + * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data + * + * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int marker_probe_unregister(const char *channel, const char *name, + marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + struct marker_probe_array *old; + int ret = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(channel, name); + if (!entry) { + ret = -ENOENT; + goto end; + } + old = marker_entry_remove_probe(entry, probe, probe_private); + remove_marker(channel, name, 1, 0); /* Ignore busy error message */ + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + return ret; + +end: + mutex_unlock(&markers_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister); + +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + unsigned int i; + struct hlist_head *head; + struct hlist_node *node; + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_array *closure; + closure = entry->multi; + for (i = 0; closure->c[i].func; i++) { + if (closure->c[i].func == probe && + closure->c[i].probe_private + == probe_private) + return entry; + } + } + } + } + return NULL; +} + +/** + * 
marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data + * + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) +{ + struct marker_entry *entry; + int ret = 0; + struct marker_probe_array *old; + const char *channel = NULL, *name = NULL; + + mutex_lock(&markers_mutex); + entry = get_marker_from_private_data(probe, probe_private); + if (!entry) { + ret = -ENOENT; + goto unlock; + } + old = marker_entry_remove_probe(entry, NULL, probe_private); + channel = kstrdup(entry->channel, GFP_KERNEL); + name = kstrdup(entry->name, GFP_KERNEL); + remove_marker(channel, name, 1, 0); /* Ignore busy error message */ + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + goto end; + +unlock: + mutex_unlock(&markers_mutex); +end: + kfree(channel); + kfree(name); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); + +/** + * marker_get_private_data - Get a marker's probe private data + * @channel: marker channel + * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data + * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. + * Returns the private data pointer, or an ERR_PTR. + * The private data pointer should _only_ be dereferenced if the caller is the + * owner of the data, or its content could vanish. This is mostly used to + * confirm that a caller is the owner of a registered probe. 
+ */ +void *marker_get_private_data(const char *channel, const char *name, + marker_probe_func *probe, int num) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + int i; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + } else { + struct marker_probe_array *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure->c[i].func; i++) { + if (closure->c[i].func != probe) + continue; + if (match++ == num) + return closure->c[i].probe_private; + } + } + break; + } + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(marker_get_private_data); + +static struct marker_entry *get_entry_from_id(u16 channel_id, u16 event_id) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e, *found = NULL; + u32 hash = hash_32((channel_id << 16) | event_id, MARKER_HASH_BITS); + + mutex_lock(&markers_mutex); + head = id_table + hash; + hlist_for_each_entry(e, node, head, id_list) { + if (e->channel_id == channel_id && e->event_id == event_id) { + found = e; + break; + } + } + mutex_unlock(&markers_mutex); + return found; +} + +/* must call when ids/marker_entry are kept alive */ +const char *marker_get_name_from_id(u16 channel_id, u16 event_id) +{ + struct marker_entry *e = get_entry_from_id(channel_id, event_id); + return e ? e->name : NULL; +} +EXPORT_SYMBOL_GPL(marker_get_name_from_id); + +const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id) +{ + struct marker_entry *e = get_entry_from_id(channel_id, event_id); + return e ? 
e->format : NULL; +} +EXPORT_SYMBOL_GPL(marker_get_fmt_from_id); + +/** + * markers_compact_event_ids - Compact markers event IDs and reassign channels + * + * Called when no channel users are active by the channel infrastructure. + * Called with trace lock, lock_markers() and channel mutex held. + * + * marker_update_probes() must be executed after compaction before releasing the + * trace lock. + */ +void markers_compact_event_ids(void) +{ + struct marker_entry *entry; + unsigned int i; + struct hlist_head *head; + struct hlist_node *node, *next; + int ret; + + _ltt_channels_reset_event_ids(); + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry_safe(entry, node, next, head, hlist) { + if (!entry->refcount) { + remove_marker(entry->channel, entry->name, + 1, 1); + continue; + } + ret = ltt_channels_get_index_from_name(entry->channel); + WARN_ON(ret < 0); + entry->channel_id = ret; + ret = _ltt_channels_get_event_id(entry->channel, + entry->name); + WARN_ON(ret < 0); + entry->event_id = ret; + } + } + + memset(id_table, 0, sizeof(id_table)); + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + hlist_add_head(&entry->id_list, id_table + hash_32( + (entry->channel_id << 16) + | entry->event_id, MARKER_HASH_BITS)); + } + } +} + +#ifdef CONFIG_MODULES + +/** + * marker_get_iter_range - Get a next marker iterator given a range. + * @marker: current markers (in), next marker (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next marker has been found (1) or not (0). + * Will return the first marker in the range if the input marker is NULL. 
+ */ +int marker_get_iter_range(struct marker **marker, struct marker *begin, + struct marker *end) +{ + if (!*marker && begin != end) { + *marker = begin; + return 1; + } + if (*marker >= begin && *marker < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(marker_get_iter_range); + +static void marker_get_iter(struct marker_iter *iter) +{ + int found = 0; + + /* Core kernel markers */ + if (!iter->module) { + found = marker_get_iter_range(&iter->marker, + __start___markers, __stop___markers); + if (found) + goto end; + } + /* Markers in modules. */ + found = module_get_iter_markers(iter); +end: + if (!found) + marker_iter_reset(iter); +} + +void marker_iter_start(struct marker_iter *iter) +{ + marker_get_iter(iter); +} +EXPORT_SYMBOL_GPL(marker_iter_start); + +void marker_iter_next(struct marker_iter *iter) +{ + iter->marker++; + /* + * iter->marker may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the markers, getting the + * markers from following modules if necessary. 
+ */ + marker_get_iter(iter); +} +EXPORT_SYMBOL_GPL(marker_iter_next); + +void marker_iter_stop(struct marker_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(marker_iter_stop); + +void marker_iter_reset(struct marker_iter *iter) +{ + iter->module = NULL; + iter->marker = NULL; +} +EXPORT_SYMBOL_GPL(marker_iter_reset); + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); + +#endif /* CONFIG_MODULES */ + +void ltt_dump_marker_state(struct ltt_trace *trace) +{ + struct marker_entry *entry; + struct ltt_probe_private_data call_data; + struct hlist_head *head; + struct hlist_node *node; + unsigned int i; + + mutex_lock(&markers_mutex); + call_data.trace = trace; + call_data.serializer = NULL; + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + __trace_mark(0, metadata, core_marker_id, + &call_data, + "channel %s name %s event_id %hu " + "int #1u%zu long #1u%zu pointer #1u%zu " + "size_t #1u%zu alignment #1u%u", + entry->channel, + entry->name, + entry->event_id, + sizeof(int), sizeof(long), + sizeof(void *), sizeof(size_t), + ltt_get_alignment()); + if (entry->format) + __trace_mark(0, metadata, + core_marker_format, + &call_data, + "channel %s name %s format %s", + entry->channel, + entry->name, + entry->format); + } + } + mutex_unlock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(ltt_dump_marker_state); diff --git a/kernel/module.c b/kernel/module.c index 
efa290ea94bf..0ea12e0d472c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include <linux/percpu.h> #include <linux/kmemleak.h> #include <linux/jump_label.h> +#include <trace/kernel.h> #include <linux/pfn.h> #define CREATE_TRACE_POINTS @@ -99,7 +100,9 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ + * (delete uses stop_machine/add uses RCU list operations). + * Sorted by ascending list node address. + */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -120,6 +123,9 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; +DEFINE_TRACE(kernel_module_load); +DEFINE_TRACE(kernel_module_free); + int register_module_notifier(struct notifier_block * nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -1675,6 +1681,7 @@ static inline void unset_section_ro_nx(struct module *mod, void *module_region) /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { + trace_kernel_module_free(mod); trace_module_free(mod); /* Delete from various lists */ @@ -2272,6 +2279,12 @@ static int copy_and_check(struct load_info *info, if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return -ENOMEM; + /* + * Make sure the module text or data access never generates any page + * fault. 
+ */ + vmalloc_sync_all(); + if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -2459,6 +2472,10 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ctors), &mod->num_ctors); #endif +#ifdef CONFIG_MARKERS + mod->markers = section_objs(info, "__markers", + sizeof(*mod->markers), &mod->num_markers); +#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -2717,7 +2734,7 @@ static struct module *load_module(void __user *umod, const char __user *uargs) { struct load_info info = { NULL, }; - struct module *mod; + struct module *mod, *iter; long err; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2799,7 +2816,23 @@ static struct module *load_module(void __user *umod, goto ddebug; module_bug_finalize(info.hdr, info.sechdrs, mod); + /* + * We sort the modules by struct module pointer address to permit + * correct iteration over modules of, at least, kallsyms for preemptible + * operations, such as read(). Sorting by struct module pointer address + * is equivalent to sort by list node address. + */ + list_for_each_entry_reverse(iter, &modules, list) { + BUG_ON(iter == mod); /* Should never be in the list twice */ + if (iter < mod) { + /* We belong to the location right after iter. */ + list_add_rcu(&mod->list, &iter->list); + goto module_added; + } + } + /* We should be added at the head of the list */ list_add_rcu(&mod->list, &modules); +module_added: mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ @@ -2817,6 +2850,7 @@ static struct module *load_module(void __user *umod, free_copy(&info); /* Done! 
*/ + trace_kernel_module_load(mod); trace_module_load(mod); return mod; @@ -3196,12 +3230,12 @@ static char *module_flags(struct module *mod, char *buf) static void *m_start(struct seq_file *m, loff_t *pos) { mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); + return seq_sorted_list_start(&modules, pos); } static void *m_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &modules, pos); + return seq_sorted_list_next(p, &modules, pos); } static void m_stop(struct seq_file *m, void *p) @@ -3266,6 +3300,27 @@ static int __init proc_modules_init(void) module_init(proc_modules_init); #endif +void list_modules(void *call_data) +{ + /* Enumerate loaded modules */ + struct list_head *i; + struct module *mod; + unsigned long refcount = 0; + + mutex_lock(&module_mutex); + list_for_each(i, &modules) { + mod = list_entry(i, struct module, list); +#ifdef CONFIG_MODULE_UNLOAD + refcount = module_refcount(mod); +#endif + __trace_mark(0, module_state, list_module, call_data, + "name %s state %d refcount %lu", + mod->name, mod->state, refcount); + } + mutex_unlock(&module_mutex); +} +EXPORT_SYMBOL_GPL(list_modules); + /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { @@ -3393,12 +3448,59 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, + struct marker *marker, struct tracepoint * const *tp) { } EXPORT_SYMBOL(module_layout); #endif +#ifdef CONFIG_MARKERS +void module_update_markers(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!(mod->taints & TAINT_FORCED_MODULE)) + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. 
+ */ +int module_get_iter_markers(struct marker_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!(iter_mod->taints & TAINT_FORCED_MODULE)) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->marker = NULL; + found = marker_get_iter_range(&iter->marker, + iter_mod->markers, + iter_mod->markers + iter_mod->num_markers); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif + #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb568..e84814271531 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -5,6 +5,7 @@ #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> +#include <linux/idle.h> /* * Notifier list for kernel code which wants to be called @@ -148,7 +149,7 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); - synchronize_rcu(); + synchronize_sched(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); @@ -178,9 +179,9 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, { int ret; - rcu_read_lock(); + rcu_read_lock_sched_notrace(); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - rcu_read_unlock(); + rcu_read_unlock_sched_notrace(); return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); @@ -584,3 +585,27 @@ int unregister_die_notifier(struct notifier_block *nb) return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +/* + * Trace last event before calling notifiers. 
Notifiers flush data from buffers + * before going to idle. + */ +int notrace notify_idle(enum idle_val val) +{ + return atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(notify_idle); + +void register_idle_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(register_idle_notifier); + +void unregister_idle_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(unregister_idle_notifier); diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..3fd05f5708ca 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,9 @@ #include <linux/init.h> #include <linux/nmi.h> #include <linux/dmi.h> +#include <trace/kernel.h> + +DEFINE_TRACE(kernel_panic); #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -64,6 +67,10 @@ NORET_TYPE void panic(const char * fmt, ...) long i, i_next = 0; int state = 0; + va_start(args, fmt); + trace_kernel_panic(fmt, args); + va_end(args); + /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. 
Some functions called from here want diff --git a/kernel/printk.c b/kernel/printk.c index 36231525e22f..5b87a0ce0895 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -39,6 +39,7 @@ #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <trace/kernel.h> #include <linux/rculist.h> #include <asm/uaccess.h> @@ -67,6 +68,7 @@ int console_printk[4] = { MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); /* * Low level drivers may need that to know if they can schedule in @@ -136,6 +138,9 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +DEFINE_TRACE(kernel_printk); +DEFINE_TRACE(kernel_vprintk); + #ifdef CONFIG_PRINTK static char __log_buf[__LOG_BUF_LEN]; @@ -650,6 +655,7 @@ asmlinkage int printk(const char *fmt, ...) } #endif va_start(args, fmt); + trace_kernel_printk(_RET_IP_); r = vprintk(fmt, args); va_end(args); @@ -773,6 +779,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); + trace_kernel_vprintk(_RET_IP_, printk_buf, printed_len); p = printk_buf; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..a86e46b6bc1c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -47,6 +47,7 @@ #include <linux/mutex.h> #include <linux/time.h> #include <linux/kernel_stat.h> +#include <trace/rcu.h> #include "rcutree.h" @@ -145,6 +146,10 @@ int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; module_param(rcu_cpu_stall_suppress, int, 0644); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +DEFINE_TRACE(rcu_tree_call_rcu); +DEFINE_TRACE(rcu_tree_call_rcu_bh); +DEFINE_TRACE(rcu_tree_callback); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -1143,6 +1148,7 @@ static void 
rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); + trace_rcu_tree_callback(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1488,6 +1494,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { + trace_rcu_tree_call_rcu_bh(head, _RET_IP_); __call_rcu(head, func, &rcu_bh_state); } EXPORT_SYMBOL_GPL(call_rcu_bh); diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..44d889a4d48c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9254,3 +9254,57 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ +static DEFINE_MUTEX(kernel_trace_mutex); +static int kernel_trace_refcount; + +/** + * clear_kernel_trace_flag_all_tasks - clears all TIF_KERNEL_TRACE thread flags. + * + * This function iterates on all threads in the system to clear their + * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the + * tasklist_lock held in copy_process() makes sure that once we finish clearing + * the thread flags, all threads have their flags cleared. + */ +void clear_kernel_trace_flag_all_tasks(void) +{ + struct task_struct *p; + struct task_struct *t; + + mutex_lock(&kernel_trace_mutex); + if (--kernel_trace_refcount) + goto end; + read_lock(&tasklist_lock); + do_each_thread(p, t) { + clear_tsk_thread_flag(t, TIF_KERNEL_TRACE); + } while_each_thread(p, t); + read_unlock(&tasklist_lock); +end: + mutex_unlock(&kernel_trace_mutex); +} +EXPORT_SYMBOL_GPL(clear_kernel_trace_flag_all_tasks); + +/** + * set_kernel_trace_flag_all_tasks - sets all TIF_KERNEL_TRACE thread flags. + * + * This function iterates on all threads in the system to set their + * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the + * tasklist_lock held in copy_process() makes sure that once we finish setting + * the thread flags, all threads have their flags set. 
+ */ +void set_kernel_trace_flag_all_tasks(void) +{ + struct task_struct *p; + struct task_struct *t; + + mutex_lock(&kernel_trace_mutex); + if (kernel_trace_refcount++) + goto end; + read_lock(&tasklist_lock); + do_each_thread(p, t) { + set_tsk_thread_flag(t, TIF_KERNEL_TRACE); + } while_each_thread(p, t); + read_unlock(&tasklist_lock); +end: + mutex_unlock(&kernel_trace_mutex); +} +EXPORT_SYMBOL_GPL(set_kernel_trace_flag_all_tasks); diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..a25bf611d133 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,7 +23,10 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> #include <linux/tick.h> +#include <trace/irq.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -54,6 +57,20 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; +void ltt_dump_softirq_vec(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < 32; i++) { + sprint_symbol(namebuf, (unsigned long)softirq_vec[i].action); + __trace_mark(0, softirq_state, softirq_vec, call_data, + "id %d address %p symbol %s", + i, softirq_vec[i].action, namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_softirq_vec); + static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { @@ -61,6 +78,11 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +DEFINE_TRACE(irq_tasklet_high_entry); +DEFINE_TRACE(irq_tasklet_high_exit); +DEFINE_TRACE(irq_tasklet_low_entry); +DEFINE_TRACE(irq_tasklet_low_exit); + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -341,6 +363,7 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { + trace_softirq_raise(nr); __raise_softirq_irqoff(nr); /* @@ -440,7 +463,9 @@ static void 
tasklet_action(struct softirq_action *a) if (!atomic_read(&t->count)) { if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); + trace_irq_tasklet_low_entry(t); t->func(t->data); + trace_irq_tasklet_low_exit(t); tasklet_unlock(t); continue; } @@ -475,7 +500,9 @@ static void tasklet_hi_action(struct softirq_action *a) if (!atomic_read(&t->count)) { if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); + trace_irq_tasklet_high_entry(t); t->func(t->data); + trace_irq_tasklet_high_exit(t); tasklet_unlock(t); continue; } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..dbaa0648631c 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_HAVE_UNSYNCHRONIZED_TSC) += tsc-sync.o diff --git a/kernel/time/tsc-sync.c b/kernel/time/tsc-sync.c new file mode 100644 index 000000000000..2ac1544ee224 --- /dev/null +++ b/kernel/time/tsc-sync.c @@ -0,0 +1,313 @@ +/* + * kernel/time/tsc-sync.c + * + * Test TSC synchronization + * + * marks the tsc as unstable _and_ keep a simple "_tsc_is_sync" variable, which + * is fast to read when a simple test must determine which clock source to use + * for kernel tracing. + * + * - CPU init : + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. + * + * - When CPUs are up : + * + * TSC synchronicity of all CPUs can be checked later at run-time by calling + * test_tsc_synchronization(). 
+ * + * Copyright 2007, 2008 + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + */ +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/trace-clock.h> +#include <linux/cpu.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/cpu.h> + +#define MAX_CYCLES_DELTA 3000ULL + +/* + * Number of loops to take care of MCE, NMIs, SMIs. + */ +#define NR_LOOPS 200 + +static DEFINE_MUTEX(tscsync_mutex); + +struct sync_data { + int nr_waits; + int wait_sync; + cycles_t tsc_count; +} ____cacheline_aligned; + +/* 0 is master, 1 is slave */ +static struct sync_data sync_data[2] = { + [0 ... 1] = { + .nr_waits = 3 * NR_LOOPS + 1, + .wait_sync = 3 * NR_LOOPS + 1, + }, +}; + +int _tsc_is_sync = 1; +EXPORT_SYMBOL(_tsc_is_sync); + +static int force_tsc_sync; +static cycles_t slave_offset; +static int slave_offset_ready; /* for 32-bits architectures */ + +static int __init force_tsc_sync_setup(char *str) +{ + force_tsc_sync = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("force_tsc_sync=", force_tsc_sync_setup); + +/* + * Mark it noinline so we make sure it is not unrolled. + * Wait until value is reached. + */ +static noinline void tsc_barrier(long this_cpu) +{ + sync_core(); + sync_data[this_cpu].wait_sync--; + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1 - this_cpu].wait_sync >= + sync_data[this_cpu].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). + */ + smp_mb(); /* order master/slave sync_data read/write */ + sync_data[this_cpu].nr_waits--; + get_cycles_barrier(); + sync_data[this_cpu].tsc_count = get_cycles(); + get_cycles_barrier(); +} + +/* + * Worker thread called on each CPU. + * First wait with interrupts enabled, then wait with interrupt disabled, + * for precision. We are already bound to one CPU. 
+ * this_cpu 0 : master + * this_cpu 1 : slave + */ +static void test_sync(void *arg) +{ + long this_cpu = (long)arg; + unsigned long flags; + + local_irq_save(flags); + /* Make sure the instructions are in I-CACHE */ + tsc_barrier(this_cpu); + tsc_barrier(this_cpu); + sync_data[this_cpu].wait_sync--; + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1 - this_cpu].wait_sync >= + sync_data[this_cpu].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). + */ + smp_mb(); /* order master/slave sync_data read/write */ + sync_data[this_cpu].nr_waits--; + /* + * Here, only the master will wait for the slave to reach this barrier. + * This makes sure that the master, which holds the mutex and will reset + * the barriers, waits for the slave to stop using the barrier values + * before it continues. This is only done at the complete end of all the + * loops. This is why there is a + 1 in original wait_sync value. + */ + if (sync_data[this_cpu].nr_waits == 1) + sync_data[this_cpu].wait_sync--; + local_irq_restore(flags); +} + +/* + * Each CPU (master and target) must decrement the wait_sync value twice (one + * for priming in cache), and also once after the get_cycles. After all the + * loops, one last synchronization is required to make sure the master waits + * for the slave before resetting the barriers. + */ +static void reset_barriers(void) +{ + int i; + + /* + * Wait until slave is done so that we don't overwrite + * wait_end_sync prematurely. + */ + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1].wait_sync >= sync_data[0].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). 
+ */ + smp_mb(); /* order master/slave sync_data read/write */ + + for (i = 0; i < 2; i++) { + WARN_ON(sync_data[i].wait_sync != 0); + WARN_ON(sync_data[i].nr_waits != 1); + sync_data[i].wait_sync = 3 * NR_LOOPS + 1; + sync_data[i].nr_waits = 3 * NR_LOOPS + 1; + } +} + +/* + * Do loops (making sure no unexpected event changes the timing), keep the best + * one. The result of each loop is the highest tsc delta between the master CPU + * and the slaves. Stop CPU hotplug when this code is executed to make sure we + * are concurrency-safe wrt CPU hotplug also using this code. Test TSC + * synchronization even if we already "know" CPUs were not synchronized. This + * can be used as a test to check if, for some reason, the CPUs eventually got + * in sync after a CPU has been unplugged. This code is kept separate from the + * CPU hotplug code because the slave CPU executes in an IPI, which we want to + * keep as short as possible (this is happening while the system is running). + * Therefore, we do not send a single IPI for all the test loops, but rather + * send one IPI per loop. 
+ */ +int test_tsc_synchronization(void) +{ + long cpu, master; + cycles_t max_diff = 0, diff, best_loop, worse_loop = 0; + int i; + + mutex_lock(&tscsync_mutex); + get_online_cpus(); + + printk(KERN_INFO + "checking TSC synchronization across all online CPUs:"); + + preempt_disable(); + master = smp_processor_id(); + for_each_online_cpu(cpu) { + if (master == cpu) + continue; + best_loop = (cycles_t)ULLONG_MAX; + for (i = 0; i < NR_LOOPS; i++) { + smp_call_function_single(cpu, test_sync, + (void *)1UL, 0); + test_sync((void *)0UL); + diff = abs(sync_data[1].tsc_count + - sync_data[0].tsc_count); + best_loop = min(best_loop, diff); + worse_loop = max(worse_loop, diff); + } + reset_barriers(); + max_diff = max(best_loop, max_diff); + } + preempt_enable(); + if (max_diff >= MAX_CYCLES_DELTA) { + printk(KERN_WARNING + "Measured %llu cycles TSC offset between CPUs," + " turning off TSC clock.\n", (u64)max_diff); + mark_tsc_unstable("check_tsc_sync_source failed"); + _tsc_is_sync = 0; + } else { + printk(" passed.\n"); + } + put_online_cpus(); + mutex_unlock(&tscsync_mutex); + return max_diff < MAX_CYCLES_DELTA; +} +EXPORT_SYMBOL_GPL(test_tsc_synchronization); + +/* + * Test synchronicity of a single core when it is hotplugged. + * Source CPU calls into this - waits for the freshly booted target CPU to + * arrive and then start the measurement: + */ +void __cpuinit check_tsc_sync_source(int cpu) +{ + cycles_t diff, abs_diff, + best_loop = (cycles_t)ULLONG_MAX, worse_loop = 0; + int i; + + /* + * No need to check if we already know that the TSC is not synchronized: + */ + if (!force_tsc_sync && unsynchronized_tsc()) { + /* + * Make sure we mark _tsc_is_sync to 0 if the TSC is found + * to be unsynchronized for other causes than non-synchronized + * TSCs across CPUs. 
+ */ + _tsc_is_sync = 0; + set_trace_clock_is_sync(0); + return; + } + + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); + + for (i = 0; i < NR_LOOPS; i++) { + test_sync((void *)0UL); + diff = sync_data[1].tsc_count - sync_data[0].tsc_count; + abs_diff = abs(diff); + best_loop = min(best_loop, abs_diff); + worse_loop = max(worse_loop, abs_diff); + if (force_tsc_sync && best_loop == abs_diff) + slave_offset = diff; + } + reset_barriers(); + + if (!force_tsc_sync && best_loop >= MAX_CYCLES_DELTA) { + printk(" failed.\n"); + printk(KERN_WARNING + "Measured %llu cycles TSC offset between CPUs," + " turning off TSC clock.\n", (u64)best_loop); + mark_tsc_unstable("check_tsc_sync_source failed"); + _tsc_is_sync = 0; + set_trace_clock_is_sync(0); + } else { + printk(" %s.\n", !force_tsc_sync ? "passed" : "forced"); + } + if (force_tsc_sync) { + /* order slave_offset and slave_offset_ready writes */ + smp_wmb(); + slave_offset_ready = 1; + } +} + +/* + * Freshly booted CPUs call into this: + */ +void __cpuinit check_tsc_sync_target(void) +{ + int i; + + if (!force_tsc_sync && unsynchronized_tsc()) + return; + + for (i = 0; i < NR_LOOPS; i++) + test_sync((void *)1UL); + + /* + * Force slave synchronization if requested. + */ + if (force_tsc_sync) { + unsigned long flags; + cycles_t new_tsc; + + while (!slave_offset_ready) + cpu_relax(); + /* order slave_offset and slave_offset_ready reads */ + smp_rmb(); + local_irq_save(flags); + /* + * slave_offset is read when master has finished writing to it, + * and is protected by cpu hotplug serialization. 
+ */ + new_tsc = get_cycles() - slave_offset; + write_tsc((u32)new_tsc, (u32)((u64)new_tsc >> 32)); + local_irq_restore(flags); + } +} diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..65cc58ce148f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -40,12 +40,14 @@ #include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> +#include <trace/timer.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> +#include <asm/irq_regs.h> #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -54,6 +56,10 @@ u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); +DEFINE_TRACE(timer_set); +DEFINE_TRACE(timer_update_time); +DEFINE_TRACE(timer_timeout); + /* * per-CPU timer vector definitions: */ @@ -366,6 +372,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; } + trace_timer_set(timer); /* * Timers are FIFO: */ @@ -1303,8 +1310,13 @@ void run_local_timers(void) void do_timer(unsigned long ticks) { + struct timespec curtime, wtom; + jiffies_64 += ticks; update_wall_time(); + curtime = __current_kernel_time(); + wtom = __get_wall_to_monotonic(); + trace_timer_update_time(&curtime, &wtom); calc_global_load(ticks); } @@ -1387,7 +1399,9 @@ SYSCALL_DEFINE0(getegid) static void process_timeout(unsigned long __data) { - wake_up_process((struct task_struct *)__data); + struct task_struct *task = (struct task_struct *)__data; + trace_timer_timeout(task); + wake_up_process(task); } /** diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..614d9153a249 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace-clock-32-to-64.o 
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace-clock.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace-clock-32-to-64.c b/kernel/trace/trace-clock-32-to-64.c new file mode 100644 index 000000000000..843749a72ac1 --- /dev/null +++ b/kernel/trace/trace-clock-32-to-64.c @@ -0,0 +1,295 @@ +/* + * kernel/trace/trace-clock-32-to-64.c + * + * (C) Copyright 2006,2007,2008 - + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Extends a 32 bits clock source to a full 64 bits count, readable atomically + * from any execution context. + * + * notes : + * - trace clock 32->64 bits extended timer-based clock cannot be used for early + * tracing in the boot process, as it depends on timer interrupts. + * - The timer is only on one CPU to support hotplug. + * - We have the choice between schedule_delayed_work_on and an IPI to get each + * CPU to write the heartbeat. IPI has been chosen because it is considered + * faster than passing through the timer to get the work scheduled on all the + * CPUs. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <linux/cpu.h> +#include <linux/timex.h> +#include <linux/bitops.h> +#include <linux/trace-clock.h> +#include <linux/smp.h> +#include <linux/sched.h> /* needed due to include order problem on m68k */ +#include <linux/math64.h> + +#define HW_BITMASK ((1ULL << TC_HW_BITS) - 1) +#define HW_LS32(hw) ((hw) & HW_BITMASK) +#define SW_MS32(sw) ((sw) & ~HW_BITMASK) + +static DEFINE_SPINLOCK(synthetic_tsc_lock); +static int synthetic_tsc_refcount; /* Number of readers */ +static int synthetic_tsc_enabled; /* synth. 
TSC enabled on all online CPUs */ + +static DEFINE_PER_CPU(struct timer_list, tsc_timer); +static unsigned int precalc_expire; + +struct synthetic_tsc_struct { + union { + u64 val; + struct { +#ifdef __BIG_ENDIAN + u32 ms32; + u32 ls32; +#else + u32 ls32; + u32 ms32; +#endif + } sel; + } tsc[2]; + unsigned int index; /* Index of the current synth. tsc. */ +}; + +static DEFINE_PER_CPU(struct synthetic_tsc_struct, synthetic_tsc); + +/* Called from IPI or timer interrupt */ +static void update_synthetic_tsc(void) +{ + struct synthetic_tsc_struct *cpu_synth; + u32 tsc; + + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + tsc = trace_clock_read32(); /* Hardware clocksource read */ + + if (tsc < HW_LS32(cpu_synth->tsc[cpu_synth->index].sel.ls32)) { + unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */ + /* + * Overflow + * Non atomic update of the non current synthetic TSC, followed + * by an atomic index change. There is no write concurrency, + * so the index read/write does not need to be atomic. + */ + cpu_synth->tsc[new_index].val = + (SW_MS32(cpu_synth->tsc[cpu_synth->index].val) + | (u64)tsc) + (1ULL << TC_HW_BITS); + /* + * Ensure the compiler does not reorder index write. It makes + * sure all nested interrupts will see the new value before the + * new index is written. + */ + barrier(); + cpu_synth->index = new_index; /* atomic change of index */ + } else { + /* + * No overflow : We know that the only bits changed are + * contained in the 32 LS32s, which can be written to atomically. + */ + cpu_synth->tsc[cpu_synth->index].sel.ls32 = + SW_MS32(cpu_synth->tsc[cpu_synth->index].sel.ls32) | tsc; + } +} + +/* + * Should only be called when interrupts are off. Affects only current CPU. 
+ */ +void _trace_clock_write_synthetic_tsc(u64 value) +{ + struct synthetic_tsc_struct *cpu_synth; + unsigned int new_index; + + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + new_index = 1 - cpu_synth->index; /* 0 <-> 1 */ + cpu_synth->tsc[new_index].val = value; + barrier(); + cpu_synth->index = new_index; /* atomic change of index */ +} + +/* Called from buffer switch : in _any_ context (even NMI) */ +u64 notrace trace_clock_read_synthetic_tsc(void) +{ + struct synthetic_tsc_struct *cpu_synth; + u64 ret; + unsigned int index; + u32 tsc; + + preempt_disable_notrace(); + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + index = ACCESS_ONCE(cpu_synth->index); /* atomic read */ + tsc = trace_clock_read32(); /* Hardware clocksource read */ + + /* Overflow detection */ + if (unlikely(tsc < HW_LS32(cpu_synth->tsc[index].sel.ls32))) + ret = (SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc) + + (1ULL << TC_HW_BITS); + else + ret = SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc; + preempt_enable_notrace(); + return ret; +} +EXPORT_SYMBOL_GPL(trace_clock_read_synthetic_tsc); + +static void synthetic_tsc_ipi(void *info) +{ + update_synthetic_tsc(); +} + +/* + * tsc_timer_fct : - Timer function synchronizing synthetic TSC. + * @data: unused + * + * Guarantees at least 1 execution before low word of TSC wraps. + */ +static void tsc_timer_fct(unsigned long data) +{ + update_synthetic_tsc(); + + mod_timer_pinned(&per_cpu(tsc_timer, smp_processor_id()), + jiffies + precalc_expire); +} + +/* + * precalc_stsc_interval: - Precalculates the interval between the clock + * wraparounds. 
+ */ +static int __init precalc_stsc_interval(void) +{ + u64 rem_freq, rem_interval; + + precalc_expire = + __iter_div_u64_rem(HW_BITMASK, ( + __iter_div_u64_rem(trace_clock_frequency(), + HZ * trace_clock_freq_scale(), &rem_freq) << 1 + ) + - 1 + - (TC_EXPECTED_INTERRUPT_LATENCY * HZ / 1000), &rem_interval) + >> 1; + WARN_ON(precalc_expire == 0); + printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n", + precalc_expire); + return 0; +} + +static void prepare_synthetic_tsc(int cpu) +{ + struct synthetic_tsc_struct *cpu_synth; + u64 local_count; + + cpu_synth = &per_cpu(synthetic_tsc, cpu); + local_count = trace_clock_read_synthetic_tsc(); + cpu_synth->tsc[0].val = local_count; + cpu_synth->index = 0; + smp_wmb(); /* Writing in data of CPU about to come up */ + init_timer_deferrable(&per_cpu(tsc_timer, cpu)); + per_cpu(tsc_timer, cpu).function = tsc_timer_fct; + per_cpu(tsc_timer, cpu).expires = jiffies + precalc_expire; +} + +static void enable_synthetic_tsc(int cpu) +{ + smp_call_function_single(cpu, synthetic_tsc_ipi, NULL, 1); + add_timer_on(&per_cpu(tsc_timer, cpu), cpu); +} + +static void disable_synthetic_tsc(int cpu) +{ + del_timer_sync(&per_cpu(tsc_timer, cpu)); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Sets the new CPU's current synthetic TSC to the same value as the + * currently running CPU. + * + * Returns the success/failure of the operation. 
(NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + prepare_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + enable_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + disable_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +void get_synthetic_tsc(void) +{ + int cpu; + + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount++) + goto end; + + synthetic_tsc_enabled = 1; + for_each_online_cpu(cpu) { + prepare_synthetic_tsc(cpu); + enable_synthetic_tsc(cpu); + } +end: + spin_unlock(&synthetic_tsc_lock); +} +EXPORT_SYMBOL_GPL(get_synthetic_tsc); + +void put_synthetic_tsc(void) +{ + int cpu; + + spin_lock(&synthetic_tsc_lock); + WARN_ON(synthetic_tsc_refcount <= 0); + if (synthetic_tsc_refcount != 1 || !synthetic_tsc_enabled) + goto end; + + for_each_online_cpu(cpu) + disable_synthetic_tsc(cpu); + synthetic_tsc_enabled = 0; +end: + synthetic_tsc_refcount--; + spin_unlock(&synthetic_tsc_lock); +} +EXPORT_SYMBOL_GPL(put_synthetic_tsc); + +/* Called from CPU 0, before any tracing starts, to init each structure */ +static int __init init_synthetic_tsc(void) +{ + precalc_stsc_interval(); + hotcpu_notifier(hotcpu_callback, 3); + return 0; +} + +/* Before SMP is up */ +early_initcall(init_synthetic_tsc); diff --git a/kernel/trace/trace-clock.c b/kernel/trace/trace-clock.c new file mode 100644 index 000000000000..3ed1667aacba --- 
/dev/null +++ b/kernel/trace/trace-clock.c @@ -0,0 +1,97 @@ +/* + * kernel/trace/trace-clock.c + * + * (C) Copyright 2008 - + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Generic kernel tracing clock for architectures without TSC. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <linux/cpu.h> +#include <linux/timex.h> +#include <linux/bitops.h> +#include <linux/trace-clock.h> +#include <linux/jiffies.h> + +static int trace_clock_refcount; +static DEFINE_MUTEX(trace_clock_mutex); +static struct timer_list trace_clock_timer; +/* + * bits 0..12 : counter, atomically incremented + * bits 13..{32,64} : time counter, incremented each jiffy. + */ +atomic_long_t trace_clock_var; +EXPORT_SYMBOL(trace_clock_var); + +static void trace_clock_update(void) +{ + long old_clock, new_clock; + unsigned long ticks; + + /* + * Make sure we keep track of delayed timer. + */ + ticks = jiffies - trace_clock_timer.expires + 1; + /* Don't update if ticks is zero, time would go backward. 
*/ + if (unlikely(!ticks)) + return; + do { + old_clock = atomic_long_read(&trace_clock_var); + new_clock = (old_clock + (ticks << TRACE_CLOCK_SHIFT)) + & (~((1 << TRACE_CLOCK_SHIFT) - 1)); + } while (atomic_long_cmpxchg(&trace_clock_var, old_clock, new_clock) + != old_clock); +} + +static void trace_clock_timer_fct(unsigned long data) +{ + trace_clock_update(); + trace_clock_timer.expires = jiffies + 1; + add_timer(&trace_clock_timer); +} + +static void enable_trace_clock(void) +{ + init_timer(&trace_clock_timer); + /* trace_clock_update() reads expires */ + trace_clock_timer.function = trace_clock_timer_fct; + trace_clock_timer.expires = jiffies + 1; + trace_clock_update(); + add_timer(&trace_clock_timer); +} + +static void disable_trace_clock(void) +{ + del_timer_sync(&trace_clock_timer); +} + +void get_trace_clock(void) +{ + get_synthetic_tsc(); + mutex_lock(&trace_clock_mutex); + if (trace_clock_refcount++) + goto end; + enable_trace_clock(); +end: + mutex_unlock(&trace_clock_mutex); +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + mutex_lock(&trace_clock_mutex); + WARN_ON(trace_clock_refcount <= 0); + if (trace_clock_refcount != 1) + goto end; + disable_trace_clock(); +end: + trace_clock_refcount--; + mutex_unlock(&trace_clock_mutex); + put_synthetic_tsc(); +} +EXPORT_SYMBOL_GPL(put_trace_clock); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..687699d365ae 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> diff --git a/mm/filemap.c b/mm/filemap.c index 83a45d35468b..74bb9f8acf39 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> /* for 
page_is_file_cache() */ +#include <trace/filemap.h> #include "internal.h" /* @@ -43,6 +44,11 @@ #include <asm/mman.h> +DEFINE_TRACE(wait_on_page_start); +DEFINE_TRACE(wait_on_page_end); +DEFINE_TRACE(add_to_page_cache); +DEFINE_TRACE(remove_from_page_cache); + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -120,6 +126,7 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + trace_remove_from_page_cache(mapping); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); @@ -419,6 +426,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + trace_add_to_page_cache(mapping, offset); if (PageSwapBacked(page)) __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); @@ -511,9 +519,11 @@ void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + trace_wait_on_page_start(page, bit_nr); if (test_bit(bit_nr, &page->flags)) __wait_on_bit(page_waitqueue(page), &wait, sync_page, TASK_UNINTERRUPTIBLE); + trace_wait_on_page_end(page, bit_nr); } EXPORT_SYMBOL(wait_on_page_bit); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb0b7c128015..2114fb2615e3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <trace/hugetlb.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -53,6 +54,14 @@ static unsigned long __initdata default_hstate_size; */ static DEFINE_SPINLOCK(hugetlb_lock); +DEFINE_TRACE(hugetlb_page_release); +DEFINE_TRACE(hugetlb_page_grab); +DEFINE_TRACE(hugetlb_buddy_pgalloc); +DEFINE_TRACE(hugetlb_page_alloc); +DEFINE_TRACE(hugetlb_page_free); +DEFINE_TRACE(hugetlb_pages_reserve); +DEFINE_TRACE(hugetlb_pages_unreserve); + /* * Region tracking -- 
allows tracking of reservations and instantiated pages * across the pages in a mapping. @@ -500,6 +509,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) VM_BUG_ON(h->order >= MAX_ORDER); + trace_hugetlb_page_release(page); h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { @@ -534,6 +544,7 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct address_space *mapping; + trace_hugetlb_page_free(page); mapping = (struct address_space *) page_private(page); set_page_private(page, 0); page->mapping = NULL; @@ -598,8 +609,10 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (h->order >= MAX_ORDER) - return NULL; + if (h->order >= MAX_ORDER) { + page = NULL; + goto end; + } page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| @@ -608,11 +621,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); - return NULL; + page = NULL; + goto end; } prep_new_huge_page(h, page, nid); } - +end: + trace_hugetlb_page_grab(page); return page; } @@ -781,7 +796,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); - return NULL; + page = NULL; + goto end; } else { h->nr_huge_pages++; h->surplus_huge_pages++; @@ -818,7 +834,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); - +end: + trace_hugetlb_buddy_pgalloc(page); return page; } @@ -1054,6 +1071,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, vma_commit_reservation(h, vma, addr); + trace_hugetlb_page_alloc(page); return page; } @@ -2837,7 +2855,8 @@ int hugetlb_reserve_pages(struct inode 
*inode, struct vm_area_struct *vma, int acctflag) { - long ret, chg; + int ret = 0; + long chg; struct hstate *h = hstate_inode(inode); /* @@ -2846,7 +2865,7 @@ int hugetlb_reserve_pages(struct inode *inode, * and filesystem quota without using reserves */ if (acctflag & VM_NORESERVE) - return 0; + goto end; /* * Shared mappings base their reservation on the number of pages that @@ -2858,8 +2877,10 @@ int hugetlb_reserve_pages(struct inode *inode, chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; + if (!resv_map) { + ret = -ENOMEM; + goto end; + } chg = to - from; @@ -2867,12 +2888,16 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) - return chg; + if (chg < 0) { + ret = chg; + goto end; + } /* There must be enough filesystem quota for the mapping */ - if (hugetlb_get_quota(inode->i_mapping, chg)) - return -ENOSPC; + if (hugetlb_get_quota(inode->i_mapping, chg)) { + ret = -ENOSPC; + goto end; + } /* * Check enough hugepages are available for the reservation. 
@@ -2881,7 +2906,7 @@ int hugetlb_reserve_pages(struct inode *inode, ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); - return ret; + goto end; } /* @@ -2897,14 +2922,18 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); - return 0; +end: + trace_hugetlb_pages_reserve(inode, from, to, ret); + return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + long chg; + trace_hugetlb_pages_unreserve(inode, offset, freed); + chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); diff --git a/mm/memory.c b/mm/memory.c index 5823698c2b71..7c4cb4e8515b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,8 @@ #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> +#include <trace/swap.h> +#include <trace/fault.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -67,6 +69,10 @@ #include "internal.h" +DEFINE_TRACE(swap_in); +DEFINE_TRACE(page_fault_get_user_entry); +DEFINE_TRACE(page_fault_get_user_exit); + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -2747,6 +2753,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + trace_swap_in(page, entry); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdef1d4b4e47..3ea857c5e283 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include <linux/compaction.h> #include <trace/events/kmem.h> #include 
<linux/ftrace_event.h> +#include <trace/page_alloc.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -129,6 +130,9 @@ void pm_restrict_gfp_mask(void) int pageblock_order __read_mostly; #endif +DEFINE_TRACE(page_alloc); +DEFINE_TRACE(page_free); + static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -2165,6 +2169,7 @@ nopage: } return page; got_pg: + trace_page_alloc(page, order); if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..d262ffb0c2da 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,8 +18,11 @@ #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> +#include <trace/swap.h> #include <asm/pgtable.h> +DEFINE_TRACE(swap_out); + static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { @@ -109,6 +112,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) rw |= REQ_SYNC | REQ_UNPLUG; count_vm_event(PSWPOUT); set_page_writeback(page); + trace_swap_out(page); unlock_page(page); submit_bio(rw, bio); out: diff --git a/mm/swapfile.c b/mm/swapfile.c index 0341c5700e34..41a00cead759 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,12 +31,16 @@ #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> +#include <trace/swap.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/page_cgroup.h> +DEFINE_TRACE(swap_file_open); +DEFINE_TRACE(swap_file_close); + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); @@ -1669,6 +1673,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + trace_swap_file_close(swap_file); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); @@ -2129,6 +2134,7 @@ SYSCALL_DEFINE2(swapon, const char 
__user *, specialfile, int, swap_flags) swap_list.head = swap_list.next = type; else swap_info[prev]->next = type; + trace_swap_file_open(swap_file, name); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -2280,6 +2286,13 @@ int swap_duplicate(swp_entry_t entry) return err; } +struct swap_info_struct * +get_swap_info_struct(unsigned type) +{ + return swap_info[type]; +} +EXPORT_SYMBOL_GPL(get_swap_info_struct); + /* * @entry: swap entry for which we allocate swap cache. * @@ -2560,3 +2573,22 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +void ltt_dump_swap_files(void *call_data) +{ + int type; + struct swap_info_struct *p = NULL; + + mutex_lock(&swapon_mutex); + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { + p = swap_info[type]; + if (!(p->flags & SWP_WRITEOK)) + continue; + __trace_mark(0, swap_state, statedump_swap_files, call_data, + "filp %p vfsmount %p dname %s", + p->swap_file, p->swap_file->f_vfsmnt, + p->swap_file->f_dentry->d_name.name); + } + mutex_unlock(&swapon_mutex); +} +EXPORT_SYMBOL_GPL(ltt_dump_swap_files); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9b166732e70..f2df4585ae29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2059,7 +2059,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); void __attribute__((weak)) vmalloc_sync_all(void) { } - +EXPORT_SYMBOL_GPL(vmalloc_sync_all); static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { diff --git a/net/core/dev.c b/net/core/dev.c index 8ae6631abcc2..4dd6af4b3739 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include <trace/events/skb.h> #include <linux/pci.h> #include <linux/inetdevice.h> +#include <trace/net.h> #include "net-sysfs.h" @@ -198,6 +199,13 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +DEFINE_TRACE(lttng_net_dev_xmit); +DEFINE_TRACE(lttng_net_dev_receive); 
+DEFINE_TRACE(net_napi_schedule); +DEFINE_TRACE(net_napi_poll); +DEFINE_TRACE(net_napi_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(net_napi_complete); + static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); @@ -2111,6 +2119,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } + trace_lttng_net_dev_xmit(skb); rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc); if (rc == NETDEV_TX_OK) @@ -2132,6 +2141,7 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + trace_lttng_net_dev_xmit(nskb); rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc); if (unlikely(rc != NETDEV_TX_OK)) { @@ -2733,6 +2743,8 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; + trace_lttng_net_dev_receive(skb); + if (netdev_tstamp_prequeue) net_timestamp_check(skb); @@ -3166,6 +3178,8 @@ int netif_receive_skb(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; + trace_lttng_net_dev_receive(skb); + #ifdef CONFIG_RPS { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -3617,6 +3631,8 @@ void __napi_schedule(struct napi_struct *n) { unsigned long flags; + trace_net_napi_schedule(n); + local_irq_save(flags); ____napi_schedule(&__get_cpu_var(softnet_data), n); local_irq_restore(flags); @@ -3631,6 +3647,7 @@ void __napi_complete(struct napi_struct *n) list_del(&n->poll_list); smp_mb__before_clear_bit(); clear_bit(NAPI_STATE_SCHED, &n->state); + trace_net_napi_complete(n); } EXPORT_SYMBOL(__napi_complete); @@ -3730,6 +3747,7 @@ static void net_rx_action(struct softirq_action *h) */ work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { + trace_net_napi_poll(n); work = n->poll(n, weight); trace_napi_poll(n); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index df4616fce929..a3af729a5e57 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -62,6 +62,7 @@ #include <net/ip_fib.h> 
#include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <trace/ipv4.h> static struct ipv4_devconf ipv4_devconf = { .data = { @@ -92,6 +93,9 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; +DEFINE_TRACE(ipv4_addr_add); +DEFINE_TRACE(ipv4_addr_del); + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -252,6 +256,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, struct in_ifaddr **ifap1 = &ifa1->ifa_next; while ((ifa = *ifap1) != NULL) { + trace_ipv4_addr_del(ifa); if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -358,6 +363,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + trace_ipv4_addr_add(ifa); } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02f583b3744a..61f72b24e7ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -81,6 +81,9 @@ #include <linux/crypto.h> #include <linux/scatterlist.h> +#include <trace/net.h> + +DEFINE_TRACE(net_tcpv4_rcv); int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; @@ -1543,6 +1546,9 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; + + trace_net_tcpv4_rcv(skb); + #ifdef CONFIG_TCP_MD5SIG /* * We really want to reject the packet as early as possible diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17959ee..21f5aac48d3d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <trace/net.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -125,6 +126,8 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +DEFINE_TRACE(net_udpv4_rcv); + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -1596,6 +1599,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; + trace_net_udpv4_rcv(skb); + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fd6782e3a038..62c96253c360 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -87,6 +87,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <trace/ipv6.h> /* Set to 3 to get tracing... */ #define ACONF_DEBUG 2 @@ -108,6 +109,9 @@ static inline u32 cstamp_delta(unsigned long cstamp) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) +DEFINE_TRACE(ipv6_addr_add); +DEFINE_TRACE(ipv6_addr_del); + #ifdef CONFIG_SYSCTL static void addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -676,6 +680,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, /* For caller */ in6_ifa_hold(ifa); + trace_ipv6_addr_add(ifa); + /* Add to big hash table */ hash = ipv6_addr_hash(addr); @@ -2212,6 +2218,7 @@ static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); + trace_ipv6_addr_del(ifp); ipv6_del_addr(ifp); /* If the last address is deleted administratively, diff --git a/net/socket.c b/net/socket.c index ac2219f90d5d..332c43007d10 100644 --- a/net/socket.c +++ b/net/socket.c @@ -98,6 +98,7 @@ #include <net/sock.h> #include <linux/netfilter.h> +#include <trace/socket.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> @@ -164,6 +165,21 @@ static const struct net_proto_family __rcu *net_families[NPROTO] 
__read_mostly; static DEFINE_PER_CPU(int, sockets_in_use); +DEFINE_TRACE(socket_create); +DEFINE_TRACE(socket_bind); +DEFINE_TRACE(socket_connect); +DEFINE_TRACE(socket_listen); +DEFINE_TRACE(socket_accept); +DEFINE_TRACE(socket_getsockname); +DEFINE_TRACE(socket_getpeername); +DEFINE_TRACE(socket_socketpair); +DEFINE_TRACE(socket_sendmsg); +DEFINE_TRACE(socket_recvmsg); +DEFINE_TRACE(socket_setsockopt); +DEFINE_TRACE(socket_getsockopt); +DEFINE_TRACE(socket_shutdown); +DEFINE_TRACE(socket_call); + /* * Support routines. * Move socket addresses back and forth across the kernel/user @@ -564,7 +580,9 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, if (err) return err; - return sock->ops->sendmsg(iocb, sock, msg, size); + err = sock->ops->sendmsg(iocb, sock, msg, size); + trace_socket_sendmsg(sock, msg, size, err); + return err; } int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -682,6 +700,7 @@ static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock_iocb *si = kiocb_to_siocb(iocb); + int err; sock_update_classid(sock->sk); @@ -691,7 +710,9 @@ static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, si->size = size; si->flags = flags; - return sock->ops->recvmsg(iocb, sock, msg, size, flags); + err = sock->ops->recvmsg(iocb, sock, msg, size, flags); + trace_socket_recvmsg(sock, msg, size, flags, err); + return err; } static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, @@ -1299,8 +1320,10 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); flags = type & ~SOCK_TYPE_MASK; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) { + retval = -EINVAL; + goto out; + } type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) @@ -1314,12 +1337,12 @@ 
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) if (retval < 0) goto out_release; -out: - /* It may be already another descriptor 8) Not kernel problem. */ - return retval; - + goto out; out_release: sock_release(sock); +out: + trace_socket_create(family, type, protocol, sock, retval); + /* It may be already another descriptor 8) Not kernel problem. */ return retval; } @@ -1336,8 +1359,10 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int flags; flags = type & ~SOCK_TYPE_MASK; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) { + err = -EINVAL; + goto out; + } type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) @@ -1386,17 +1411,18 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, if (!err) err = put_user(fd2, &usockvec[1]); if (!err) - return 0; + goto out; sys_close(fd2); sys_close(fd1); - return err; + goto out; out_release_both: sock_release(sock2); out_release_1: sock_release(sock1); out: + trace_socket_socketpair(family, type, protocol, usockvec, err); return err; } @@ -1428,6 +1454,7 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) } fput_light(sock->file, fput_needed); } + trace_socket_bind(fd, umyaddr, addrlen, err); return err; } @@ -1455,6 +1482,7 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) fput_light(sock->file, fput_needed); } + trace_socket_listen(fd, backlog, err); return err; } @@ -1478,8 +1506,10 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int err, len, newfd, fput_needed; struct sockaddr_storage address; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) { + err = -EINVAL; + goto out; + } if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; @@ -1537,6 +1567,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr 
__user *, upeer_sockaddr, out_put: fput_light(sock->file, fput_needed); out: + trace_socket_accept(fd, upeer_sockaddr, upeer_addrlen, flags, err); return err; out_fd: fput(newfile); @@ -1586,6 +1617,7 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, out_put: fput_light(sock->file, fput_needed); out: + trace_socket_connect(fd, uservaddr, addrlen, err); return err; } @@ -1617,6 +1649,7 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, out_put: fput_light(sock->file, fput_needed); out: + trace_socket_getsockname(fd, usockaddr, usockaddr_len, err); return err; } @@ -1637,7 +1670,7 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); - return err; + goto out; } err = @@ -1648,6 +1681,8 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } +out: + trace_socket_getpeername(fd, usockaddr, usockaddr_len, err); return err; } @@ -1778,8 +1813,10 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, int err, fput_needed; struct socket *sock; - if (optlen < 0) - return -EINVAL; + if (optlen < 0) { + err = -EINVAL; + goto out; + } sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -1798,6 +1835,8 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, out_put: fput_light(sock->file, fput_needed); } +out: + trace_socket_setsockopt(fd, level, optname, optval, optlen, err); return err; } @@ -1829,6 +1868,7 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, out_put: fput_light(sock->file, fput_needed); } + trace_socket_getsockopt(fd, level, optname, optval, optlen, err); return err; } @@ -1848,6 +1888,7 @@ SYSCALL_DEFINE2(shutdown, int, fd, int, how) err = sock->ops->shutdown(sock, how); fput_light(sock->file, fput_needed); } + trace_socket_shutdown(fd, how, err); return err; } @@ 
-2249,6 +2290,8 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) a0 = a[0]; a1 = a[1]; + trace_socket_call(call, a0); + switch (call) { case SYS_SOCKET: err = sys_socket(a0, a1, a[2]); diff --git a/samples/Kconfig b/samples/Kconfig index e03cf0e374d7..7b1eadec88a9 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -5,6 +5,12 @@ menuconfig SAMPLES if SAMPLES +config SAMPLE_MARKERS + tristate "Build markers examples -- loadable modules only" + depends on MARKERS && m + help + This build markers example modules. + config SAMPLE_TRACEPOINTS tristate "Build tracepoints examples -- loadable modules only" depends on TRACEPOINTS && m diff --git a/samples/Makefile b/samples/Makefile index f26c0959fd86..ab3546401973 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,4 +1,4 @@ # Makefile for Linux samples code obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \ - hw_breakpoint/ kfifo/ kdb/ + hw_breakpoint/ kfifo/ kdb/ markers/ diff --git a/samples/markers/Makefile b/samples/markers/Makefile new file mode 100644 index 000000000000..2244152159d7 --- /dev/null +++ b/samples/markers/Makefile @@ -0,0 +1,4 @@ +# builds the kprobes example kernel modules; +# then to use one (as root): insmod <module_name.ko> + +obj-$(CONFIG_SAMPLE_MARKERS) += probe-example.o marker-example.o test-multi.o diff --git a/samples/markers/marker-example.c b/samples/markers/marker-example.c new file mode 100644 index 000000000000..06afde476318 --- /dev/null +++ b/samples/markers/marker-example.c @@ -0,0 +1,53 @@ +/* marker-example.c + * + * Executes a marker when /proc/marker-example is opened. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. 
+ */ + +#include <linux/module.h> +#include <linux/marker.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> + +struct proc_dir_entry *pentry_example; + +static int my_open(struct inode *inode, struct file *file) +{ + int i; + + trace_mark(samples, subsystem_event, "integer %d string %s", 123, + "example string"); + for (i = 0; i < 10; i++) + trace_mark(samples, subsystem_eventb, MARK_NOARGS); + return -EPERM; +} + +static struct file_operations mark_ops = { + .open = my_open, +}; + +static int __init example_init(void) +{ + printk(KERN_ALERT "example init\n"); + pentry_example = proc_create("marker-example", 0444, NULL, &mark_ops); + if (!pentry_example) + return -EPERM; + return 0; +} + +static void __exit example_exit(void) +{ + printk(KERN_ALERT "example exit\n"); + remove_proc_entry("marker-example", NULL); +} + +module_init(example_init) +module_exit(example_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Marker example"); diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c new file mode 100644 index 000000000000..2c449c0c1eba --- /dev/null +++ b/samples/markers/probe-example.c @@ -0,0 +1,94 @@ +/* probe-example.c + * + * Connects two functions to marker call sites. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. 
+ */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <asm/atomic.h> + +struct probe_data { + const char *name; + const char *format; + marker_probe_func *probe_func; +}; + +static void probe_subsystem_event(const struct marker *mdata, + void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Declare args */ + unsigned int value; + const char *mystr; + + /* Assign args */ + value = va_arg(*args, typeof(value)); + mystr = va_arg(*args, typeof(mystr)); + + /* Call printk */ + printk(KERN_INFO "Value %u, string %s\n", value, mystr); + + /* or count, check rights, serialize data in a buffer */ +} + +atomic_t eventb_count = ATOMIC_INIT(0); + +static void probe_subsystem_eventb(const struct marker *mdata, + void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +static struct probe_data probe_array[] = +{ + { .name = "subsystem_event", + .format = "integer %d string %s", + .probe_func = probe_subsystem_event }, + { .name = "subsystem_eventb", + .format = MARK_NOARGS, + .probe_func = probe_subsystem_eventb }, +}; + +static int __init probe_init(void) +{ + int result; + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) { + result = marker_probe_register("samples", probe_array[i].name, + probe_array[i].format, + probe_array[i].probe_func, &probe_array[i]); + if (result) + printk(KERN_INFO "Unable to register probe %s\n", + probe_array[i].name); + } + return 0; +} + +static void __exit probe_fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) + marker_probe_unregister("samples", probe_array[i].name, + probe_array[i].probe_func, &probe_array[i]); + printk(KERN_INFO "Number of event b : %u\n", + atomic_read(&eventb_count)); + marker_synchronize_unregister(); +} + +module_init(probe_init); +module_exit(probe_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); 
+MODULE_DESCRIPTION("SUBSYSTEM Probe"); diff --git a/samples/markers/test-multi.c b/samples/markers/test-multi.c new file mode 100644 index 000000000000..9f62d2921b88 --- /dev/null +++ b/samples/markers/test-multi.c @@ -0,0 +1,116 @@ +/* test-multi.c + * + * Connects multiple callbacks. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <asm/atomic.h> + +struct probe_data { + const char *name; + const char *format; + marker_probe_func *probe_func; +}; + +atomic_t eventb_count = ATOMIC_INIT(0); + +void probe_subsystem_eventa(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +void probe_subsystem_eventb(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +void probe_subsystem_eventc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +void probe_subsystem_eventd(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + /* Increment counter */ + atomic_inc(&eventb_count); +} + +static struct probe_data probe_array[] = +{ + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0xa }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0xb }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0xc }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0xd }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0x10 }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = 
(marker_probe_func*)0x20 }, + { .name = "test_multi", + .format = MARK_NOARGS, + .probe_func = (marker_probe_func*)0x30 }, +}; + +static int __init probe_init(void) +{ + int result; + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) { + result = marker_probe_register("samples", probe_array[i].name, + probe_array[i].format, + probe_array[i].probe_func, (void*)(long)i); + if (result) + printk(KERN_INFO "Unable to register probe %s\n", + probe_array[i].name); + } + return 0; +} + +static void __exit probe_fini(void) +{ + int result; + int i; + + for (i = 0; i < ARRAY_SIZE(probe_array); i++) { + result = marker_probe_unregister("samples", probe_array[i].name, + probe_array[i].probe_func, (void*)(long)i); + if (result) + printk(KERN_INFO "Unable to unregister probe %s\n", + probe_array[i].name); + } + printk(KERN_INFO "Number of event b : %u\n", + atomic_read(&eventb_count)); + marker_synchronize_unregister(); +} + +module_init(probe_init); +module_exit(probe_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("SUBSYSTEM Probe"); diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 7d22056582c1..e0d2e063d910 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -13,6 +13,7 @@ # 2) modpost is then used to # 3) create one <module>.mod.c file pr. 
module # 4) create one Module.symvers file with CRC for all exported symbols +# 4a) [CONFIG_MARKERS] create one Module.markers file listing defined markers # 5) compile all <module>.mod.c files # 6) final link of the module to a <module.ko> file @@ -58,6 +59,10 @@ include scripts/Makefile.lib kernelsymfile := $(objtree)/Module.symvers modulesymfile := $(firstword $(KBUILD_EXTMOD))/Module.symvers +kernelmarkersfile := $(objtree)/Module.markers +modulemarkersfile := $(firstword $(KBUILD_EXTMOD))/Module.markers + +markersfile = $(if $(KBUILD_EXTMOD),$(modulemarkersfile),$(kernelmarkersfile)) # Step 1), find all modules listed in $(MODVERDIR)/ __modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod))) @@ -80,6 +85,8 @@ modpost = scripts/mod/modpost \ $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \ $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S) \ + $(if $(CONFIG_MARKERS),-K $(kernelmarkersfile)) \ + $(if $(CONFIG_MARKERS),-M $(markersfile)) \ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) \ $(if $(cross_build),-c) @@ -94,12 +101,17 @@ quiet_cmd_kernel-mod = MODPOST $@ cmd_kernel-mod = $(modpost) $@ vmlinux.o: FORCE + @rm -fr $(kernelmarkersfile) $(call cmd,kernel-mod) # Declare generated files as targets for modpost $(symverfile): __modpost ; $(modules:.ko=.mod.c): __modpost ; +ifdef CONFIG_MARKERS +$(markersfile): __modpost ; +endif + # Step 5), compile all *.mod.c files diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index e8fba959fffb..097d58047fe0 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -481,6 +481,8 @@ static int parse_elf(struct elf_info *info, const char *filename) info->export_unused_gpl_sec = i; else if (strcmp(secname, "__ksymtab_gpl_future") == 0) info->export_gpl_future_sec = i; + else if (strcmp(secname, "__markers_strings") == 0) + info->markers_strings_sec = i; if (sechdrs[i].sh_type == SHT_SYMTAB) { unsigned int 
sh_link_idx; @@ -798,6 +800,7 @@ static const char *section_white_list[] = ".note*", ".got*", ".toc*", + "__discard", NULL }; @@ -1640,6 +1643,62 @@ static void check_sec_ref(struct module *mod, const char *modname, } } +static void get_markers(struct elf_info *info, struct module *mod) +{ + const Elf_Shdr *sh = &info->sechdrs[info->markers_strings_sec]; + const char *strings = (const char *) info->hdr + sh->sh_offset; + const Elf_Sym *sym, *first_sym, *last_sym; + size_t n; + + if (!info->markers_strings_sec) + return; + + /* + * First count the strings. We look for all the symbols defined + * in the __markers_strings section named __mstrtab_*. For + * these local names, the compiler puts a random .NNN suffix on, + * so the names don't correspond exactly. + */ + first_sym = last_sym = NULL; + n = 0; + for (sym = info->symtab_start; sym < info->symtab_stop; sym++) + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT && + sym->st_shndx == info->markers_strings_sec && + !strncmp(info->strtab + sym->st_name, + "__mstrtab_", sizeof "__mstrtab_" - 1)) { + if (first_sym == NULL) + first_sym = sym; + last_sym = sym; + ++n; + } + + if (n == 0) + return; + + /* + * Now collect each name and format into a line for the output. + * Lines look like: + * marker_name vmlinux marker %s format %d + * The format string after the second \t can use whitespace. 
+ */ + mod->markers = NOFAIL(malloc(sizeof mod->markers[0] * n)); + mod->nmarkers = n; + + n = 0; + for (sym = first_sym; sym <= last_sym; sym++) + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT && + sym->st_shndx == info->markers_strings_sec && + !strncmp(info->strtab + sym->st_name, + "__mstrtab_", sizeof "__mstrtab_" - 1)) { + const char *name = strings + sym->st_value; + const char *fmt = strchr(name, '\0') + 1; + char *line = NULL; + asprintf(&line, "%s\t%s\t%s\n", name, mod->name, fmt); + NOFAIL(line); + mod->markers[n++] = line; + } +} + static void read_symbols(char *modname) { const char *symname; @@ -1695,6 +1754,8 @@ static void read_symbols(char *modname) get_src_version(modname, mod->srcversion, sizeof(mod->srcversion)-1); + get_markers(&info, mod); + parse_elf_finish(&info); /* Our trick to get versioning for module struct etc. - it's @@ -2049,6 +2110,96 @@ static void write_dump(const char *fname) write_if_changed(&buf, fname); } +static void add_marker(struct module *mod, const char *name, const char *fmt) +{ + char *line = NULL; + asprintf(&line, "%s\t%s\t%s\n", name, mod->name, fmt); + NOFAIL(line); + + mod->markers = NOFAIL(realloc(mod->markers, ((mod->nmarkers + 1) * + sizeof mod->markers[0]))); + mod->markers[mod->nmarkers++] = line; +} + +static void read_markers(const char *fname) +{ + unsigned long size, pos = 0; + void *file = grab_file(fname, &size); + char *line; + + if (!file) /* No old markers, silently ignore */ + return; + + while ((line = get_next_line(&pos, file, size))) { + char *marker, *modname, *fmt; + struct module *mod; + + marker = line; + modname = strchr(marker, '\t'); + if (!modname) + goto fail; + *modname++ = '\0'; + fmt = strchr(modname, '\t'); + if (!fmt) + goto fail; + *fmt++ = '\0'; + if (*marker == '\0' || *modname == '\0') + goto fail; + + mod = find_module(modname); + if (!mod) { + mod = new_module(modname); + mod->skip = 1; + } + if (is_vmlinux(modname)) { + have_vmlinux = 1; + mod->skip = 0; + } + + if (!mod->skip) 
+ add_marker(mod, marker, fmt); + } + release_file(file, size); + return; +fail: + fatal("parse error in markers list file\n"); +} + +static int compare_strings(const void *a, const void *b) +{ + return strcmp(*(const char **) a, *(const char **) b); +} + +static void write_markers(const char *fname) +{ + struct buffer buf = { }; + struct module *mod; + size_t i; + + for (mod = modules; mod; mod = mod->next) + if ((!external_module || !mod->skip) && mod->markers != NULL) { + /* + * Sort the strings so we can skip duplicates when + * we write them out. + */ + qsort(mod->markers, mod->nmarkers, + sizeof mod->markers[0], &compare_strings); + for (i = 0; i < mod->nmarkers; ++i) { + char *line = mod->markers[i]; + buf_write(&buf, line, strlen(line)); + while (i + 1 < mod->nmarkers && + !strcmp(mod->markers[i], + mod->markers[i + 1])) + free(mod->markers[i++]); + free(mod->markers[i]); + } + free(mod->markers); + mod->markers = NULL; + } + + write_if_changed(&buf, fname); +} + struct ext_sym_list { struct ext_sym_list *next; const char *file; @@ -2060,6 +2211,8 @@ int main(int argc, char **argv) struct buffer buf = { }; char *kernel_read = NULL, *module_read = NULL; char *dump_write = NULL; + char *markers_read = NULL; + char *markers_write = NULL; int opt; int err; struct ext_sym_list *extsym_iter; @@ -2103,6 +2256,12 @@ int main(int argc, char **argv) case 'w': warn_unresolved = 1; break; + case 'M': + markers_write = optarg; + break; + case 'K': + markers_read = optarg; + break; default: exit(1); } @@ -2157,5 +2316,11 @@ int main(int argc, char **argv) "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", sec_mismatch_count); + if (markers_read) + read_markers(markers_read); + + if (markers_write) + write_markers(markers_write); + return err; } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 0388cfccac8d..dbde650961e5 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -112,6 +112,8 @@ struct module { int has_init; int has_cleanup; struct buffer 
dev_table_buf; + char **markers; + size_t nmarkers; char srcversion[25]; }; @@ -126,6 +128,7 @@ struct elf_info { Elf_Section export_gpl_sec; Elf_Section export_unused_gpl_sec; Elf_Section export_gpl_future_sec; + Elf_Section markers_strings_sec; const char *strtab; char *modinfo; unsigned int modinfo_len; |