Diffstat (limited to 'arch/x86')
48 files changed, 885 insertions, 289 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aa..b0389519b6d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86 select HAVE_READQ select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_GET_CYCLES select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS @@ -27,9 +28,11 @@ config X86 select HAVE_KPROBES select HAVE_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB + select HAVE_LTT_DUMP_TABLES select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_TRACE_CLOCK select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_C_RECORDMCOUNT @@ -208,10 +211,12 @@ config HAVE_INTEL_TXT config X86_32_SMP def_bool y depends on X86_32 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_64_SMP def_bool y depends on X86_64 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_HT def_bool y diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 29cdcd02ead..accd6b42bd2 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -8,8 +8,11 @@ #include <linux/shm.h> #include <linux/ipc.h> #include <linux/compat.h> +#include <trace/ipc.h> #include <asm/sys_ia32.h> +DEFINE_TRACE(ipc_call); + asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) { @@ -18,6 +21,8 @@ asmlinkage long sys32_ipc(u32 call, int first, int second, int third, version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 1f11ce44e95..d09bb03653f 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -37,7 +37,7 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) @@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { + int ret = 0; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) /* Real i386 machines have no cmpxchg instruction */ @@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, return -ENOSYS; #endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" "2:\t.section .fixup, \"ax\"\n" - "3:\tmov %2, %0\n" + "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" _ASM_EXTABLE(1b, 3b) - : "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "0" (oldval) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "1" (oldval) : "memory" ); - return oldval; + *uval = oldval; + return ret; 
} #endif diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 38d87379e27..9b1db108f9e 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -1,20 +1,9 @@ #ifndef _ASM_X86_IDLE_H #define _ASM_X86_IDLE_H -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -#endif /* CONFIG_X86_64 */ +extern void enter_idle(void); +extern void __exit_idle(void); +extern void exit_idle(void); void c1e_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8bf10..fdf897373e1 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -56,6 +56,61 @@ static inline void native_halt(void) #endif +#ifdef CONFIG_X86_64 +/* + * Only returns from a trap or exception to a NMI context (intra-privilege + * level near return) to the same SS and CS segments. Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. It takes care of modifying the eflags, rsp and returning to the + * previous function. + * + * The stack, at that point, looks like : + * + * 0(rsp) RIP + * 8(rsp) CS + * 16(rsp) EFLAGS + * 24(rsp) RSP + * 32(rsp) SS + * + * Upon execution : + * Copy EIP to the top of the return stack + * Update top of return stack address + * Pop eflags into the eflags register + * Make the return stack current + * Near return (popping the return address from the return stack) + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushq %rax; \ + movq %rsp, %rax; \ + movq 24+8(%rax), %rsp; \ + pushq 0+8(%rax); \ + pushq 16+8(%rax); \ + movq (%rax), %rax; \ + popfq; \ + ret +#else +/* + * Protected mode only, no V8086. Implies that protected mode must + * be entered before NMIs or MCEs are enabled. Only returns from a trap or + * exception to a NMI context (intra-privilege level far return). Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. + * + * The stack, at that point, looks like : + * + * 0(esp) EIP + * 4(esp) CS + * 8(esp) EFLAGS + * + * Upon execution : + * Copy the stack eflags to top of stack + * Pop eflags into the eflags register + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS. + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \ + popfl; \ + lret $4 +#endif + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -112,6 +167,7 @@ static inline unsigned long arch_local_irq_save(void) #define ENABLE_INTERRUPTS(x) sti #define DISABLE_INTERRUPTS(x) cli +#define INTERRUPT_RETURN_NMI_SAFE NATIVE_INTERRUPT_RETURN_NMI_SAFE #ifdef CONFIG_X86_64 #define SWAPGS swapgs diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/include/asm/kvm-mmutrace.h index b60b4fdb3ed..42d117d0418 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/include/asm/kvm-mmutrace.h @@ -217,9 +217,9 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace +#define TRACE_INCLUDE_FILE kvm-mmutrace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/trace.h b/arch/x86/include/asm/kvm-trace.h index 1357d7cf4ec..c1e151c092b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/include/asm/kvm-trace.h @@ -701,9 +701,9 @@ TRACE_EVENT(kvm_emulate_insn, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace +#define TRACE_INCLUDE_FILE kvm-trace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8ab17..1ef6906c179 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -962,6 +962,10 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) +#define INTERRUPT_RETURN_NMI_SAFE \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE, \ + jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return) + #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c86..3e0634cc127 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -181,6 +181,7 @@ struct pv_cpu_ops { /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); + void (*nmi_return)(void); void (*swapgs)(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a..58a37ae7565 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_KERNEL_TRACE 9 /* kernel trace active */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -105,6 +106,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -121,18 +123,19 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_KERNEL_TRACE) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & \ ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) + 
_TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU|_TIF_KERNEL_TRACE)) /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h new file mode 100644 index 00000000000..8ca73323366 --- /dev/null +++ b/arch/x86/include/asm/trace-clock.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +/* + * linux/arch/x86/include/asm/trace-clock.h + * + * Copyright (C) 2005,2006,2008 + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Trace clock definitions for x86. + */ + +#include <linux/timex.h> +#include <linux/time.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +/* Minimum duration of a probe, in cycles */ +#define TRACE_CLOCK_MIN_PROBE_DURATION 200 +#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION + +union lttng_timespec { + struct timespec ts; + u64 lttng_ts; +}; + +extern cycles_t trace_clock_async_tsc_read(void); + +extern int _trace_clock_is_sync; +static inline int trace_clock_is_sync(void) +{ + return _trace_clock_is_sync; +} + +static inline u32 trace_clock_read32(void) +{ + u32 cycles; + + if (likely(trace_clock_is_sync())) + cycles = (u32)get_cycles(); /* only need the 32 LSB */ + else + cycles = (u32)trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_read64(void) +{ + u64 cycles; + + if (likely(trace_clock_is_sync())) + cycles = get_cycles(); + else + cycles = trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_frequency(void) +{ + return (u64)cpu_khz * 1000; +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern int get_trace_clock(void); +extern void put_trace_clock(void); + +extern void set_trace_clock_is_sync(int state); + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ca132fc0d0..28e56e1ec3c 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,18 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +static inline cycles_t get_cycles_rate(void) +{ + if (check_tsc_unstable()) + return 0; + return (cycles_t)tsc_khz * 1000; +} + +static inline void get_cycles_barrier(void) +{ + rdtsc_barrier(); +} + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: @@ -62,4 +74,10 @@ extern int notsc_setup(char *); extern void save_sched_clock_state(void); extern void restore_sched_clock_state(void); +extern int test_tsc_synchronization(void); +extern int _tsc_is_sync; +static inline int tsc_is_sync(void) +{ + return _tsc_is_sync; +} #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e204826..06abe8f409a 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -12,6 +12,7 @@ struct vsyscall_gtod_data { u32 wall_time_nsec; int sysctl_enabled; + int trace_clock_is_sync; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d0983d255fb..47b80f3ba4d 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -39,6 +39,14 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +#ifdef CONFIG_X86_64 +extern void update_trace_clock_is_sync_vdso(void); +#else +static inline void 
update_trace_clock_is_sync_vdso(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd88..717cf9c620b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,6 +46,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -66,9 +67,8 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978..c604d23b4f3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -33,6 +33,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <trace/irq.h> #include <asm/perf_event.h> #include <asm/x86_init.h> @@ -868,7 +869,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) */ exit_idle(); irq_enter(); + trace_irq_entry(LOCAL_TIMER_VECTOR, regs, NULL); local_apic_timer_interrupt(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); set_irq_regs(old_regs); @@ -1788,6 +1791,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(SPURIOUS_APIC_VECTOR, NULL, NULL); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1802,6 +1806,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } @@ -1814,6 +1819,7 @@ void smp_error_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(ERROR_APIC_VECTOR, NULL, NULL); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1834,6 +1840,7 @@ void smp_error_interrupt(struct pt_regs *regs) */ pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c2a74..60939d5f226 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/idle.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,6 +236,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/idle.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -947,10 +949,17 @@ recalc: break; } } + enter_idle(); if (original_pm_idle) original_pm_idle(); else default_idle(); + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37..677f8475d9d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -111,6 +111,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd96..1aea11cd840 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -58,6 +58,7 @@ int main(void) OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396b..6052f6f65a6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1069,6 +1069,7 @@ unsigned long kernel_eflags; * debugging, no special alignment required. */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +EXPORT_PER_CPU_SYMBOL_GPL(orig_ist); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97..c8a6411d8ba 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/processor.h> #include <asm/system.h> @@ -402,8 +403,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { exit_idle(); irq_enter(); + trace_irq_entry(THERMAL_APIC_VECTOR, regs, NULL); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index df20723a6a1..6bed23e1c74 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,6 +15,7 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> +#include <linux/ltt-core.h> #include <asm/stacktrace.h> @@ -253,6 +254,8 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!signr) return; + if (in_nmi()) + panic("Fatal exception in non-maskable interrupt"); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) @@ -277,6 +280,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); +#ifdef CONFIG_LTT + printk(KERN_EMERG "LTT NESTING LEVEL : %u", __get_cpu_var(ltt_nesting)); + printk("\n"); +#endif sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9ca3b0e343e..afd6d8ef78c 100644 --- a/arch/x86/kernel/entry_32.S +++ 
b/arch/x86/kernel/entry_32.S @@ -80,6 +80,8 @@ #define nr_syscalls ((syscall_table_size)/4) +#define NMI_MASK 0x04000000 + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -321,8 +323,32 @@ END(ret_from_fork) # userspace resumption stub bypassing syscall exit tracing ALIGN RING0_PTREGS_FRAME + ret_from_exception: preempt_stop(CLBR_ANY) + GET_THREAD_INFO(%ebp) + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jae resume_userspace # returning to v8086 or userspace + testl $NMI_MASK,TI_preempt_count(%ebp) + jz resume_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz resume_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + TRACE_IRQS_IRET + RESTORE_REGS + addl $4, %esp # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 + INTERRUPT_RETURN_NMI_SAFE + ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -906,6 +932,10 @@ ENTRY(native_iret) .previous END(native_iret) +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ? +END(native_nmi_return) + ENTRY(native_irq_enable_sysexit) sti sysexit diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bbd5c80cb09..7800ff65aab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -163,6 +163,8 @@ GLOBAL(return_to_handler) #endif +#define NMI_MASK 0x04000000 + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -515,6 +517,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $_TIF_KERNEL_TRACE,%edx /* Re-read : concurrently changed */ + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -524,6 +528,16 @@ sysret_careful: popq_cfi %rdi jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON @@ -872,6 +886,9 @@ ENTRY(native_iret) .section __ex_table,"a" .quad native_iret, bad_iret .previous + +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE #endif .section .fixup,"ax" @@ -924,6 +941,24 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule + /* Returning to kernel space from exception. */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retexc_kernel) + testl $NMI_MASK,TI_preempt_count(%rcx) + jz retint_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */ + jnz retint_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + RESTORE_ARGS 0,8,0 + TRACE_IRQS_IRETQ + INTERRUPT_RETURN_NMI_SAFE + #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. 
*/ @@ -1361,12 +1396,18 @@ ENTRY(paranoid_exit) paranoid_swapgs: TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK +paranoid_restore_no_nmi: RESTORE_ALL 8 jmp irq_return paranoid_restore: + GET_THREAD_INFO(%rcx) TRACE_IRQS_IRETQ 0 + testl $NMI_MASK,TI_preempt_count(%rcx) + jz paranoid_restore_no_nmi /* Nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-0(%rsp) /* trap flag? */ + jnz paranoid_restore_no_nmi RESTORE_ALL 8 - jmp irq_return + INTERRUPT_RETURN_NMI_SAFE paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx @@ -1465,7 +1506,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jne retexc_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71..1fc5da98373 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -156,6 +156,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) @@ -204,6 +205,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); +extern void native_nmi_return(void); extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -373,6 +375,7 @@ struct pv_cpu_ops pv_cpu_ops = { .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, + .nmi_return = native_nmi_return, .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab..ac372778bbc 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -1,10 +1,13 @@ -#include <asm/paravirt.h> +#include <linux/stringify.h> +#include <linux/irqflags.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); @@ -41,6 +44,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34f93e..5339e67dc15 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -1,12 +1,15 @@ +#include <linux/irqflags.h> +#include <linux/stringify.h> #include <asm/paravirt.h> #include <asm/asm-offsets.h> -#include <linux/stringify.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_cpu_ops, iret, 
"iretq"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -51,6 +54,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff455419898..e0e4ffcad48 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -13,6 +13,7 @@ #include <linux/dmi.h> #include <linux/utsname.h> #include <trace/events/power.h> +#include <trace/sched.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/system.h> @@ -23,6 +24,8 @@ #include <asm/i387.h> #include <asm/debugreg.h> +DEFINE_TRACE(sched_kthread_create); + struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -278,6 +281,7 @@ extern void kernel_thread_helper(void); int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; + long pid; memset(®s, 0, sizeof(regs)); @@ -299,7 +303,10 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, + 0, ®s, 0, NULL, NULL); + trace_sched_kthread_create(fn, pid); + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af4..256ba23211d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,9 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/notifier.h> +#include <linux/idle.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -59,6 +62,38 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + +static DEFINE_PER_CPU(unsigned char, is_idle); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + trace_pm_idle_entry(); + notify_idle(IDLE_START); +} +EXPORT_SYMBOL_GPL(enter_idle); + +void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + notify_idle(IDLE_END); + trace_pm_idle_exit(); +} +EXPORT_SYMBOL_GPL(__exit_idle); + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +EXPORT_SYMBOL_GPL(exit_idle); + /* * Return saved PC of a blocked thread. */ @@ -107,10 +142,18 @@ void cpu_idle(void) play_dead(); local_irq_disable(); + enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); start_critical_timings(); + + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b..fbde94f1447 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -35,8 +35,10 @@ #include <linux/tick.h> #include <linux/prctl.h> #include <linux/uaccess.h> +#include <linux/idle.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -51,37 +53,34 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - void enter_idle(void) { percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); + /* + * Trace last event before calling notifiers. Notifiers flush + * data from buffers before going to idle. + */ + trace_pm_idle_entry(); + notify_idle(IDLE_START); } +EXPORT_SYMBOL_GPL(enter_idle); -static void __exit_idle(void) +void __exit_idle(void) { if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); + notify_idle(IDLE_END); + trace_pm_idle_exit(); } +EXPORT_SYMBOL_GPL(__exit_idle); /* Called from interrupts to signify idle end */ void exit_idle(void) @@ -91,6 +90,7 @@ void exit_idle(void) return; __exit_idle(); } +EXPORT_SYMBOL_GPL(exit_idle); #ifndef CONFIG_SMP static inline void play_dead(void) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc4b72..ee3024d4f61 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> +#include <trace/syscall.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -152,6 +153,9 @@ static const int arg_offs_table[] = { X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* * Determines whether a value may be installed in a segment register. */ @@ -1361,6 +1365,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) if (test_thread_flag(TIF_SINGLESTEP)) regs->flags |= X86_EFLAGS_TF; + trace_syscall_entry(regs, regs->orig_ax); + /* do the secure computing check first */ secure_computing(regs->orig_ax); @@ -1396,6 +1402,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) { bool step; + trace_syscall_exit(regs->ax); + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d600829..5e74f6aa3c0 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -1,8 +1,11 @@ /* System call table for x86-64. 
*/ #include <linux/linkage.h> +#include <linux/module.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> #include <asm/asm-offsets.h> #define __NO_STUBS @@ -27,3 +30,18 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/unistd_64.h> }; + +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < __NR_syscall_max + 1; i++) { + sprint_symbol(namebuf, (unsigned long)sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, + call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c new file mode 100644 index 00000000000..47539e28276 --- /dev/null +++ b/arch/x86/kernel/trace-clock.c @@ -0,0 +1,302 @@ +/* + * arch/x86/kernel/trace-clock.c + * + * Trace clock for x86. + * + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>, October 2008 + */ + +#include <linux/module.h> +#include <linux/trace-clock.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/cpu.h> +#include <linux/posix-timers.h> +#include <asm/vgtod.h> + +static cycles_t trace_clock_last_tsc; +static DEFINE_PER_CPU(struct timer_list, update_timer); +static DEFINE_SPINLOCK(async_tsc_lock); +static int async_tsc_refcount; /* Number of readers */ +static int async_tsc_enabled; /* Async TSC enabled on all online CPUs */ + +int _trace_clock_is_sync = 1; +EXPORT_SYMBOL_GPL(_trace_clock_is_sync); + +/* + * Is the trace clock being used by user-space ? We leave the trace clock active + * as soon as user-space starts using it. We never unref the trace clock + * reference taken by user-space. + */ +static atomic_t user_trace_clock_ref; + +/* + * Called by check_tsc_sync_source from CPU hotplug. + */ +void set_trace_clock_is_sync(int state) +{ + _trace_clock_is_sync = state; + update_trace_clock_is_sync_vdso(); +} + +#if BITS_PER_LONG == 64 +static cycles_t read_last_tsc(void) +{ + return trace_clock_last_tsc; +} +#else +/* + * A cmpxchg64 update can happen concurrently. Based on the assumption that + * two cmpxchg64 will never update it to the same value (the count always + * increases), reading it twice insures that we read a coherent value with the + * same "sequence number". + */ +static cycles_t read_last_tsc(void) +{ + cycles_t val1, val2; + + val1 = trace_clock_last_tsc; + for (;;) { + val2 = val1; + barrier(); + val1 = trace_clock_last_tsc; + if (likely(val1 == val2)) + break; + } + return val1; +} +#endif + +/* + * Support for architectures with non-sync TSCs. + * When the local TSC is discovered to lag behind the highest TSC counter, we + * increment the TSC count of an amount that should be, ideally, lower than the + * execution time of this routine, in cycles : this is the granularity we look + * for : we must be able to order the events. + */ +notrace cycles_t trace_clock_async_tsc_read(void) +{ + cycles_t new_tsc, last_tsc; + + WARN_ON(!async_tsc_refcount || !async_tsc_enabled); + new_tsc = get_cycles(); + last_tsc = read_last_tsc(); + do { + if (new_tsc < last_tsc) + new_tsc = last_tsc + TRACE_CLOCK_MIN_PROBE_DURATION; + /* + * If cmpxchg fails with a value higher than the new_tsc, don't + * retry : the value has been incremented and the events + * happened almost at the same time. 
+ * We must retry if cmpxchg fails with a lower value : + * it means that we are the CPU with highest frequency and + * therefore MUST update the value. + */ + last_tsc = cmpxchg64(&trace_clock_last_tsc, last_tsc, new_tsc); + } while (unlikely(last_tsc < new_tsc)); + return new_tsc; +} +EXPORT_SYMBOL_GPL(trace_clock_async_tsc_read); + +static void update_timer_ipi(void *info) +{ + (void)trace_clock_async_tsc_read(); +} + +/* + * update_timer_fct : - Timer function to resync the clocks + * @data: unused + * + * Fires every jiffy. + */ +static void update_timer_fct(unsigned long data) +{ + (void)trace_clock_async_tsc_read(); + mod_timer_pinned(&per_cpu(update_timer, smp_processor_id()), + jiffies + 1); +} + +static void enable_trace_clock(int cpu) +{ + init_timer(&per_cpu(update_timer, cpu)); + per_cpu(update_timer, cpu).function = update_timer_fct; + per_cpu(update_timer, cpu).expires = jiffies + 1; + smp_call_function_single(cpu, update_timer_ipi, NULL, 1); + add_timer_on(&per_cpu(update_timer, cpu), cpu); +} + +static void disable_trace_clock(int cpu) +{ + del_timer_sync(&per_cpu(update_timer, cpu)); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + int cpu; + + spin_lock(&async_tsc_lock); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* + * trace_clock_is_sync() is updated by set_trace_clock_is_sync() + * code, protected by cpu hotplug disable. + * It is ok to let the hotplugged CPU read the timebase before + * the CPU_ONLINE notification. It's just there to give a + * maximum bound to the TSC error. + */ + if (async_tsc_refcount && !trace_clock_is_sync()) { + if (!async_tsc_enabled) { + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); + } else { + enable_trace_clock(hotcpu); + } + } + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* + * We cannot stop the trace clock on other CPUs when readers are + * active even if we go back to a synchronized state (1 CPU) + * because the CPU left could be the one lagging behind. 
+ */ + if (async_tsc_refcount && async_tsc_enabled) + disable_trace_clock(hotcpu); + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + spin_unlock(&async_tsc_lock); + + return NOTIFY_OK; +} + +int get_trace_clock(void) +{ + int cpu; + + if (!trace_clock_is_sync()) { + printk(KERN_WARNING + "Trace clock falls back on cache-line bouncing\n" + "workaround due to non-synchronized TSCs.\n" + "This workaround preserves event order across CPUs.\n" + "Please consider disabling Speedstep or PowerNow and\n" + "using kernel parameters " + "\"force_tsc_sync=1 idle=poll\"\n" + "for accurate and fast tracing clock source.\n"); + } + + get_online_cpus(); + spin_lock(&async_tsc_lock); + if (async_tsc_refcount++ || trace_clock_is_sync()) + goto end; + + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); +end: + spin_unlock(&async_tsc_lock); + put_online_cpus(); + return 0; +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + int cpu; + + get_online_cpus(); + spin_lock(&async_tsc_lock); + WARN_ON(async_tsc_refcount <= 0); + if (async_tsc_refcount != 1 || !async_tsc_enabled) + goto end; + + for_each_online_cpu(cpu) + disable_trace_clock(cpu); + async_tsc_enabled = 0; +end: + async_tsc_refcount--; + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + spin_unlock(&async_tsc_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(put_trace_clock); + +static int posix_get_trace(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + int ret; + + /* + * Yes, there is a race here that would lead to refcount being + * incremented more than once, but all we care is to leave the trace + * clock active forever, so precise accounting is not needed. + */ + if (unlikely(!atomic_read(&user_trace_clock_ref))) { + ret = get_trace_clock(); + if (ret) + return ret; + atomic_inc(&user_trace_clock_ref); + } + lts->lttng_ts = trace_clock_read64(); + return 0; +} + +static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = trace_clock_frequency(); + return 0; +} + +static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = TRACE_CLOCK_RES; + return 0; +} + +static __init int init_unsync_trace_clock(void) +{ + struct k_clock clock_trace = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace, + }; + struct k_clock clock_trace_freq = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace_freq, + }; + + register_posix_clock(CLOCK_TRACE, &clock_trace); + register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq); + + hotcpu_notifier(hotcpu_callback, 4); + return 0; +} +early_initcall(init_unsync_trace_clock); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9d..d45030679f9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <trace/trap.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -52,6 +53,7 @@ #include <asm/atomic.h> #include <asm/system.h> #include <asm/traps.h> +#include <asm/unistd.h> #include <asm/desc.h> #include <asm/i387.h> #include <asm/mce.h> @@ -76,11 +78,21 @@ char ignore_fpu_irq; * F0 0F bug workaround. 
*/ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; + +extern unsigned long sys_call_table[]; +extern unsigned long syscall_table_size; + #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); +/* + * Also used in arch/x86/mm/fault.c. + */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + static int ignore_nmis; int unknown_nmi_panic; @@ -122,6 +134,8 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; + trace_trap_entry(regs, trapnr); + #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* @@ -168,7 +182,7 @@ trap_signal: force_sig_info(signr, info, tsk); else force_sig(signr, tsk); - return; + goto end; kernel_trap: if (!fixup_exception(regs)) { @@ -176,15 +190,17 @@ kernel_trap: tsk->thread.trap_no = trapnr; die(str, regs, error_code); } - return; + goto end; #ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; - return; + goto end; #endif +end: + trace_trap_exit(); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -285,7 +301,9 @@ do_general_protection(struct pt_regs *regs, long error_code) printk("\n"); } + trace_trap_entry(regs, 13); force_sig(SIGSEGV, tsk); + trace_trap_exit(); return; #ifdef CONFIG_X86_32 @@ -371,9 +389,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { + trace_trap_entry(regs, 2); + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + goto end; #ifdef CONFIG_MCA /* * Might actually be able to figure out what the guilty party @@ -381,7 +401,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) */ if (MCA_bus) { mca_handle_nmi(); - return; + goto end; } #endif pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", @@ -392,19 +412,23 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); +end: + trace_trap_exit(); } static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + trace_trap_entry(regs, 2); + /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific * NMI can not be detected/processed on other CPUs. 
*/ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; + goto end; /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); @@ -423,11 +447,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); #endif raw_spin_unlock(&nmi_reason_lock); - return; + goto end; } raw_spin_unlock(&nmi_reason_lock); unknown_nmi_error(reason, regs); +end: + trace_trap_exit(); } dotraplinkage notrace __kprobes void @@ -570,8 +596,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { + trace_trap_entry(regs, 1); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + trace_trap_exit(); preempt_conditional_cli(regs); return; } @@ -589,13 +617,32 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) regs->flags &= ~X86_EFLAGS_TF; } si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) { + trace_trap_entry(regs, 1); send_sigtrap(tsk, regs, error_code, si_code); + trace_trap_exit(); + } preempt_conditional_cli(regs); return; } +#ifdef CONFIG_X86_32 +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < NR_syscalls; i++) { + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -701,11 +748,13 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + trace_trap_entry(regs, 16); conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif + trace_trap_exit(); } asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) @@ -738,6 +787,21 @@ void __math_state_restore(void) tsk->fpu_counter++; } +void ltt_dump_idt_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < IDT_ENTRIES; i++) { + unsigned long address = gate_offset(idt_table[i]); + sprint_symbol(namebuf, address); + __trace_mark(0, irq_state, idt_table, call_data, + "irq %d address %p symbol %s", + i, (void *)address, namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c deleted file mode 100644 index 0aa5fed8b9e..00000000000 --- a/arch/x86/kernel/tsc_sync.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * check TSC synchronization. - * - * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar - * - * We check whether all boot CPUs have their TSC's synchronized, - * print a warning if not and turn off the TSC clock-source. - * - * The warp-check is point-to-point between two CPUs, the CPU - * initiating the bootup is the 'source CPU', the freshly booting - * CPU is the 'target CPU'. - * - * Only two CPUs may participate - they can enter in any order. 
- * ( The serial nature of the boot logic and the CPU hotplug lock - * protects against more than 2 CPUs entering this code. ) - */ -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/nmi.h> -#include <asm/tsc.h> - -/* - * Entry/exit counters that make sure that both CPUs - * run the measurement code at once: - */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; - -/* - * We use a raw spinlock in this exceptional case, because - * we want to have the fastest, inlined, non-debug version - * of a critical section, to be able to prove TSC time-warps: - */ -static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; - -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; - -/* - * TSC-warp measurement loop running on both CPUs: - */ -static __cpuinit void check_tsc_warp(void) -{ - cycles_t start, now, prev, end; - int i; - - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); - /* - * The measurement runs for 20 msecs: - */ - end = start + tsc_khz * 20ULL; - now = start; - - for (i = 0; ; i++) { - /* - * We take the global lock, measure TSC, save the - * previous TSC that was measured (possibly on - * another CPU) and update the previous TSC timestamp. - */ - arch_spin_lock(&sync_lock); - prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); - last_tsc = now; - arch_spin_unlock(&sync_lock); - - /* - * Be nice every now and then (and also check whether - * measurement is done [we also insert a 10 million - * loops safety exit, so we dont lock up in case the - * TSC readout is totally broken]): - */ - if (unlikely(!(i & 7))) { - if (now > end || i > 10000000) - break; - cpu_relax(); - touch_nmi_watchdog(); - } - /* - * Outside the critical section we can now see whether - * we saw a time-warp of the TSC going backwards: - */ - if (unlikely(prev > now)) { - arch_spin_lock(&sync_lock); - max_warp = max(max_warp, prev - now); - nr_warps++; - arch_spin_unlock(&sync_lock); - } - } - WARN(!(now-start), - "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", - now-start, end-start); -} - -/* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: - */ -void __cpuinit check_tsc_sync_source(int cpu) -{ - int cpus = 2; - - /* - * No need to check if we already know that the TSC is not - * synchronized: - */ - if (unsynchronized_tsc()) - return; - - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - - /* - * Reset it - in case this is a second bootup: - */ - atomic_set(&stop_count, 0); - - /* - * Wait for the target to arrive: - */ - while (atomic_read(&start_count) != cpus-1) - cpu_relax(); - /* - * Trigger the target to continue into the measurement too: - */ - atomic_inc(&start_count); - - check_tsc_warp(); - - while (atomic_read(&stop_count) != cpus-1) - cpu_relax(); - - if (nr_warps) { - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", - smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); - mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); - } - - /* - * Reset it - just in case we boot another 
CPU later: - */ - atomic_set(&start_count, 0); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; - - /* - * Let the target continue with the bootup: - */ - atomic_inc(&stop_count); -} - -/* - * Freshly booted CPUs call into this: - */ -void __cpuinit check_tsc_sync_target(void) -{ - int cpus = 2; - - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) - return; - - /* - * Register this CPU's participation and wait for the - * source CPU to start the measurement: - */ - atomic_inc(&start_count); - while (atomic_read(&start_count) != cpus) - cpu_relax(); - - check_tsc_warp(); - - /* - * Ok, we are done: - */ - atomic_inc(&stop_count); - - /* - * Wait for the source CPU to print stuff: - */ - while (atomic_read(&stop_count) != cpus) - cpu_relax(); -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c4b69..df18f14c473 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,6 +44,8 @@ #include <asm/desc.h> #include <asm/topology.h> #include <asm/vgtod.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #define __vsyscall(nr) \ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, + .trace_clock_is_sync = 1, }; void update_vsyscall_tz(void) @@ -73,6 +76,16 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } +void update_trace_clock_is_sync_vdso(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} +EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso); + void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { @@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac..f894af174b8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,7 +32,7 @@ #include "irq.h" #include <linux/kvm_host.h> -#include "trace.h" +#include <asm/kvm-trace.h> static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d365..58bcbce5b02 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,7 +36,7 @@ #include <asm/atomic.h> #include "kvm_cache_regs.h" #include "irq.h" -#include "trace.h" +#include <asm/kvm-trace.h> #include "x86.h" #ifndef CONFIG_X86_64 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d4..3612044ed1f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -163,7 +163,7 @@ module_param(oos_shadow, bool, 0644); #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "mmutrace.h" +#include <asm/kvm-mmutrace.h> #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d8a15a17d76..42342e8cc18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,7 +34,7 @@ #include <asm/kvm_para.h> #include <asm/virtext.h> -#include 
"trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb8..d12b42e234b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -40,7 +40,7 @@ #include <asm/i387.h> #include <asm/xcr.h> -#include "trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85b..6a8cb6fe5c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -47,7 +47,7 @@ #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "trace.h" +#include <asm/kvm-trace.h> #include <asm/debugreg.h> #include <asm/msr.h> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0..07f7a272226 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1276,6 +1276,7 @@ __init void lguest_init(void) pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; + pv_cpu_ops.nmi_return = lguest_iret; pv_cpu_ops.load_sp0 = lguest_load_sp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 20e3f8702d1..abeb09914d5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <trace/fault.h> #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -35,6 +36,11 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); +DEFINE_TRACE(page_fault_nosem_entry); +DEFINE_TRACE(page_fault_nosem_exit); + /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: @@ -719,6 +725,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; + trace_page_fault_nosem_entry(regs, 14, address); if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -728,6 +735,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + trace_page_fault_nosem_exit(); return; } @@ -1130,7 +1138,9 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ + trace_page_fault_entry(regs, 14, mm, vma, address, write); fault = handle_mm_fault(mm, vma, address, flags); + trace_page_fault_exit(fault); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8..14b9317eccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -141,6 +142,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &flush_state[sender]; + trace_irq_entry(sender, regs, NULL); + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) goto out; /* @@ -167,6 +170,7 @@ out: cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); smp_mb__after_clear_bit(); inc_irq_stat(irq_tlb_count); + trace_irq_exit(IRQ_HANDLED); } static void flush_tlb_others_ipi(const struct cpumask *cpumask, diff --git 
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6..99513642a0e 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/mfd/core.h>
 
 #include <asm/io.h>
 #include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
 static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 {
 	struct resource *res;
+	int err;
 
 	/* don't run on non-XOs */
 	if (!machine_is_olpc())
 		return -ENODEV;
 
+	err = mfd_cell_enable(pdev);
+	if (err)
+		return err;
+
 	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "can't fetch device resource info\n");
 		return -EIO;
 	}
-
-	if (!request_region(res->start, resource_size(res), DRV_NAME)) {
-		dev_err(&pdev->dev, "can't request region\n");
-		return -EIO;
-	}
-
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = res->start;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0)
 		acpi_base = res->start;
 
 	/* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 
 static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 {
-	struct resource *r;
-
-	r = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	release_region(r->start, resource_size(r));
+	mfd_cell_disable(pdev);
 
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = 0;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
 		acpi_base = 0;
 
 	pm_power_off = NULL;
@@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 
 static struct platform_driver cs5535_pms_drv = {
 	.driver = {
-		.name = "cs5535-pms",
+		.name = "olpc-xo1-pms",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
 
 static struct platform_driver cs5535_acpi_drv = {
 	.driver = {
-		.name = "cs5535-acpi",
+		.name = "olpc-xo1-acpi",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void)
 {
 	int r;
 
-	r = platform_driver_register(&cs5535_pms_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms");
 	if (r)
 		return r;
 
-	r = platform_driver_register(&cs5535_acpi_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_acpi_drv,
+						"cs5535-acpi");
 	if (r)
-		platform_driver_unregister(&cs5535_pms_drv);
+		mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 
 	return r;
 }
 
 static void __exit olpc_xo1_exit(void)
 {
-	platform_driver_unregister(&cs5535_acpi_drv);
-	platform_driver_unregister(&cs5535_pms_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_acpi_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 }
 
 MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:olpc-xo1");
+MODULE_ALIAS("platform:cs5535-pms");
 
 module_init(olpc_xo1_init);
 module_exit(olpc_xo1_exit);
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..7bc481508d0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,46 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 	return 0;
 }
 
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and export its value in the nsec field of the timespec
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+	unsigned long seq;
+	union lttng_timespec *lts = (union lttng_timespec *) ts;
+
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		if (unlikely(!gtod->trace_clock_is_sync))
+			return vdso_fallback_gettime(CLOCK_TRACE, ts);
+		/*
+		 * We don't protect the rdtsc with the rdtsc_barrier because
+		 * we can't obtain with tracing that level of precision.
+		 * The operation of recording an event is not atomic therefore
+		 * the small chance of imprecision doesn't justify the overhead
+		 * of a barrier.
+		 */
+		/*
+		 * TODO: check that vget_cycles(), using paravirt ops, will
+		 * match the TSC read by get_cycles() at the kernel level.
+		 */
+		lts->lttng_ts = vget_cycles();
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+	return 0;
+}
+
+/*
+ * Returns the cpu_khz, it needs to be a syscall because we can't access
+ * this value from userspace and it will only be called at the beginning
+ * of the tracing session
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+	return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
@@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
 			return do_monotonic_coarse(ts);
+		case CLOCK_TRACE:
+			return do_trace_clock(ts);
+		case CLOCK_TRACE_FREQ:
+			return do_trace_clock_freq(ts);
+		default:
+			return -EINVAL;
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45f..e3839c74ec4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -974,6 +974,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.read_pmc = native_read_pmc,
 
 	.iret = xen_iret,
+	.nmi_return = xen_iret,
 	.irq_enable_sysexit = xen_sysexit,
 
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
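
With the vclock_gettime.c change above, a userspace tracer can read the trace clock through the ordinary clock_gettime() vDSO fast path whenever trace_clock_is_sync is set, and transparently falls back to the syscall otherwise. The sketch below is illustration only: the CLOCK_TRACE/CLOCK_TRACE_FREQ clockid values shown are placeholders (the real definitions are assumed to come from the trace-clock headers added elsewhere in this series), and the two timespec fields returned for CLOCK_TRACE overlay a raw cycle counter via union lttng_timespec rather than a conventional sec/nsec pair.

/*
 * Userspace illustration only, not part of this patch.  The clockid values
 * are hypothetical placeholders for the ones exported by the trace-clock
 * headers introduced by this patch series.
 */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TRACE
#define CLOCK_TRACE		14	/* placeholder value */
#define CLOCK_TRACE_FREQ	15	/* placeholder value */
#endif

int main(void)
{
	struct timespec ts;

	/* Fast path: served by do_trace_clock() in the vDSO when the TSC is
	 * synchronized; otherwise the vDSO falls back to the syscall. */
	if (clock_gettime(CLOCK_TRACE, &ts) == 0)
		printf("trace clock raw fields: %ld %ld\n",
		       (long)ts.tv_sec, (long)ts.tv_nsec);

	/* Always a syscall: do_trace_clock_freq() simply forwards to it. */
	if (clock_gettime(CLOCK_TRACE_FREQ, &ts) == 0)
		printf("trace clock freq fields: %ld %ld\n",
		       (long)ts.tv_sec, (long)ts.tv_nsec);

	return 0;
}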