Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 5
-rw-r--r--  arch/x86/ia32/ipc32.c | 5
-rw-r--r--  arch/x86/include/asm/futex.h | 22
-rw-r--r--  arch/x86/include/asm/idle.h | 17
-rw-r--r--  arch/x86/include/asm/irqflags.h | 56
-rw-r--r--  arch/x86/include/asm/kvm-mmutrace.h (renamed from arch/x86/kvm/mmutrace.h) | 4
-rw-r--r--  arch/x86/include/asm/kvm-trace.h (renamed from arch/x86/kvm/trace.h) | 4
-rw-r--r--  arch/x86/include/asm/paravirt.h | 4
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 1
-rw-r--r--  arch/x86/include/asm/thread_info.h | 9
-rw-r--r--  arch/x86/include/asm/trace-clock.h | 73
-rw-r--r--  arch/x86/include/asm/tsc.h | 18
-rw-r--r--  arch/x86/include/asm/vgtod.h | 1
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 8
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 7
-rw-r--r--  arch/x86/kernel/apm_32.c | 9
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 1
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/entry_32.S | 30
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 6
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 6
-rw-r--r--  arch/x86/kernel/process.c | 9
-rw-r--r--  arch/x86/kernel/process_32.c | 43
-rw-r--r--  arch/x86/kernel/process_64.c | 34
-rw-r--r--  arch/x86/kernel/ptrace.c | 8
-rw-r--r--  arch/x86/kernel/syscall_64.c | 18
-rw-r--r--  arch/x86/kernel/trace-clock.c | 302
-rw-r--r--  arch/x86/kernel/traps.c | 80
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 198
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 14
-rw-r--r--  arch/x86/kvm/i8259.c | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 2
-rw-r--r--  arch/x86/kvm/svm.c | 2
-rw-r--r--  arch/x86/kvm/vmx.c | 2
-rw-r--r--  arch/x86/kvm/x86.c | 2
-rw-r--r--  arch/x86/lguest/boot.c | 1
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/mm/tlb.c | 4
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 42
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 48
-rw-r--r--  arch/x86/xen/enlighten.c | 1
48 files changed, 885 insertions, 289 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d30aa..b0389519b6d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -19,6 +19,7 @@ config X86
select HAVE_READQ
select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_GET_CYCLES
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PERF_EVENTS
@@ -27,9 +28,11 @@ config X86
select HAVE_KPROBES
select HAVE_MEMBLOCK
select ARCH_WANT_OPTIONAL_GPIOLIB
+ select HAVE_LTT_DUMP_TABLES
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
+ select HAVE_TRACE_CLOCK
select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_C_RECORDMCOUNT
@@ -208,10 +211,12 @@ config HAVE_INTEL_TXT
config X86_32_SMP
def_bool y
depends on X86_32 && SMP
+ select HAVE_UNSYNCHRONIZED_TSC
config X86_64_SMP
def_bool y
depends on X86_64 && SMP
+ select HAVE_UNSYNCHRONIZED_TSC
config X86_HT
def_bool y
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index 29cdcd02ead..accd6b42bd2 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -8,8 +8,11 @@
#include <linux/shm.h>
#include <linux/ipc.h>
#include <linux/compat.h>
+#include <trace/ipc.h>
#include <asm/sys_ia32.h>
+DEFINE_TRACE(ipc_call);
+
asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
compat_uptr_t ptr, u32 fifth)
{
@@ -18,6 +21,8 @@ asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
version = call >> 16; /* hack for backward compatibility */
call &= 0xffff;
+ trace_ipc_call(call, first);
+
switch (call) {
case SEMOP:
/* struct sembuf is the same on 32 and 64bit :)) */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e95..d09bb03653f 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
{
int op = (encoded_op >> 28) & 7;
int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
oparg = 1 << oparg;
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
return ret;
}
-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
- int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
{
+ int ret = 0;
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
/* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
return -ENOSYS;
#endif
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
- asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
+ asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
"2:\t.section .fixup, \"ax\"\n"
- "3:\tmov %2, %0\n"
+ "3:\tmov %3, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
- : "=a" (oldval), "+m" (*uaddr)
- : "i" (-EFAULT), "r" (newval), "0" (oldval)
+ : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "1" (oldval)
: "memory"
);
- return oldval;
+ *uval = oldval;
+ return ret;
}
#endif
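
For context, a minimal kernel-side sketch (not part of the patch; the helper name and error handling are illustrative) of how a caller adapts to the new calling convention above, where the value read from user space comes back through the uval out-parameter and the return value only reports faults:

#include <linux/errno.h>
#include <asm/futex.h>

/* Illustrative only: compare-and-swap a futex word in user space and
 * report whether the expected value was still there. */
static int example_futex_cmpxchg(u32 __user *uaddr, u32 expected, u32 newval)
{
	u32 curval;
	int ret;

	ret = futex_atomic_cmpxchg_inatomic(&curval, uaddr, expected, newval);
	if (ret)
		return ret;		/* -EFAULT, or -ENOSYS on a real i386 */
	if (curval != expected)
		return -EAGAIN;		/* someone else changed the word */
	return 0;
}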
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e27..9b1db108f9e 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -1,20 +1,9 @@
#ifndef _ASM_X86_IDLE_H
#define _ASM_X86_IDLE_H
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
-#ifdef CONFIG_X86_64
-void enter_idle(void);
-void exit_idle(void);
-#else /* !CONFIG_X86_64 */
-static inline void enter_idle(void) { }
-static inline void exit_idle(void) { }
-#endif /* CONFIG_X86_64 */
+extern void enter_idle(void);
+extern void __exit_idle(void);
+extern void exit_idle(void);
void c1e_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..fdf897373e1 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -56,6 +56,61 @@ static inline void native_halt(void)
#endif
+#ifdef CONFIG_X86_64
+/*
+ * Only returns from a trap or exception to an NMI context (intra-privilege
+ * level near return) to the same SS and CS segments. Should be used
+ * upon trap or exception return when nested over an NMI context so no iret is
+ * issued. It takes care of modifying the eflags and rsp, and returning to the
+ * previous function.
+ *
+ * The stack, at that point, looks like:
+ *
+ * 0(rsp) RIP
+ * 8(rsp) CS
+ * 16(rsp) EFLAGS
+ * 24(rsp) RSP
+ * 32(rsp) SS
+ *
+ * Upon execution:
+ * Copy RIP to the top of the return stack
+ * Update top of return stack address
+ * Pop eflags into the eflags register
+ * Make the return stack current
+ * Near return (popping the return address from the return stack)
+ */
+#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushq %rax; \
+ movq %rsp, %rax; \
+ movq 24+8(%rax), %rsp; \
+ pushq 0+8(%rax); \
+ pushq 16+8(%rax); \
+ movq (%rax), %rax; \
+ popfq; \
+ ret
+#else
+/*
+ * Protected mode only, no V8086. Implies that protected mode must
+ * be entered before NMIs or MCEs are enabled. Only returns from a trap or
+ * exception to an NMI context (intra-privilege level far return). Should be used
+ * upon trap or exception return when nested over an NMI context so no iret is
+ * issued.
+ *
+ * The stack, at that point, looks like:
+ *
+ * 0(esp) EIP
+ * 4(esp) CS
+ * 8(esp) EFLAGS
+ *
+ * Upon execution:
+ * Copy the stack eflags to top of stack
+ * Pop eflags into the eflags register
+ * Far return: pop EIP and CS into their registers, and additionally pop (skip) the saved EFLAGS.
+ */
+#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \
+ popfl; \
+ lret $4
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -112,6 +167,7 @@ static inline unsigned long arch_local_irq_save(void)
#define ENABLE_INTERRUPTS(x) sti
#define DISABLE_INTERRUPTS(x) cli
+#define INTERRUPT_RETURN_NMI_SAFE NATIVE_INTERRUPT_RETURN_NMI_SAFE
#ifdef CONFIG_X86_64
#define SWAPGS swapgs
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/include/asm/kvm-mmutrace.h
index b60b4fdb3ed..42d117d0418 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/include/asm/kvm-mmutrace.h
@@ -217,9 +217,9 @@ TRACE_EVENT(
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_PATH asm
#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE mmutrace
+#define TRACE_INCLUDE_FILE kvm-mmutrace
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/trace.h b/arch/x86/include/asm/kvm-trace.h
index 1357d7cf4ec..c1e151c092b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/include/asm/kvm-trace.h
@@ -701,9 +701,9 @@ TRACE_EVENT(kvm_emulate_insn,
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_PATH asm
#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
+#define TRACE_INCLUDE_FILE kvm-trace
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab17..1ef6906c179 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -962,6 +962,10 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
+#define INTERRUPT_RETURN_NMI_SAFE \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE, \
+ jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return)
+
#define DISABLE_INTERRUPTS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c86..3e0634cc127 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -181,6 +181,7 @@ struct pv_cpu_ops {
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
+ void (*nmi_return)(void);
void (*swapgs)(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a..58a37ae7565 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_KERNEL_TRACE 9 /* kernel trace active */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -105,6 +106,7 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -121,18 +123,19 @@ struct thread_info {
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
- _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+ _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_KERNEL_TRACE)
/* work to do in syscall_trace_leave() */
#define _TIF_WORK_SYSCALL_EXIT \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
- _TIF_SYSCALL_TRACEPOINT)
+ _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
(0x0000FFFF & \
~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
- _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
+ _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU|_TIF_KERNEL_TRACE))
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
new file mode 100644
index 00000000000..8ca73323366
--- /dev/null
+++ b/arch/x86/include/asm/trace-clock.h
@@ -0,0 +1,73 @@
+#ifndef _ASM_X86_TRACE_CLOCK_H
+#define _ASM_X86_TRACE_CLOCK_H
+
+/*
+ * linux/arch/x86/include/asm/trace-clock.h
+ *
+ * Copyright (C) 2005,2006,2008
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Trace clock definitions for x86.
+ */
+
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/atomic.h>
+
+/* Minimum duration of a probe, in cycles */
+#define TRACE_CLOCK_MIN_PROBE_DURATION 200
+#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION
+
+union lttng_timespec {
+ struct timespec ts;
+ u64 lttng_ts;
+};
+
+extern cycles_t trace_clock_async_tsc_read(void);
+
+extern int _trace_clock_is_sync;
+static inline int trace_clock_is_sync(void)
+{
+ return _trace_clock_is_sync;
+}
+
+static inline u32 trace_clock_read32(void)
+{
+ u32 cycles;
+
+ if (likely(trace_clock_is_sync()))
+ cycles = (u32)get_cycles(); /* only need the 32 LSB */
+ else
+ cycles = (u32)trace_clock_async_tsc_read();
+ return cycles;
+}
+
+static inline u64 trace_clock_read64(void)
+{
+ u64 cycles;
+
+ if (likely(trace_clock_is_sync()))
+ cycles = get_cycles();
+ else
+ cycles = trace_clock_async_tsc_read();
+ return cycles;
+}
+
+static inline u64 trace_clock_frequency(void)
+{
+ return (u64)cpu_khz * 1000;
+}
+
+static inline u32 trace_clock_freq_scale(void)
+{
+ return 1;
+}
+
+extern int get_trace_clock(void);
+extern void put_trace_clock(void);
+
+extern void set_trace_clock_is_sync(int state);
+
+#endif /* _ASM_X86_TRACE_CLOCK_H */
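
A minimal usage sketch (not part of the patch, and assuming div64_u64() from <linux/math64.h> is available) of turning a delta between two trace_clock_read64() samples into nanoseconds with the frequency reported above:

#include <linux/math64.h>
#include <asm/trace-clock.h>

/* Illustrative only: nanoseconds between two trace clock samples.
 * The multiplication limits this to deltas of a few seconds before
 * the u64 intermediate overflows. */
static inline u64 trace_clock_delta_ns(u64 start, u64 end)
{
	u64 delta = end - start;

	/* trace_clock_frequency() is in cycles per second. */
	return div64_u64(delta * NSEC_PER_SEC, trace_clock_frequency());
}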
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d0..28e56e1ec3c 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,18 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);
+static inline cycles_t get_cycles_rate(void)
+{
+ if (check_tsc_unstable())
+ return 0;
+ return (cycles_t)tsc_khz * 1000;
+}
+
+static inline void get_cycles_barrier(void)
+{
+ rdtsc_barrier();
+}
+
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
@@ -62,4 +74,10 @@ extern int notsc_setup(char *);
extern void save_sched_clock_state(void);
extern void restore_sched_clock_state(void);
+extern int test_tsc_synchronization(void);
+extern int _tsc_is_sync;
+static inline int tsc_is_sync(void)
+{
+ return _tsc_is_sync;
+}
#endif /* _ASM_X86_TSC_H */
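
As a hedged illustration (not from the patch) of how the new helpers compose, a caller could take a fenced TSC sample and bail out when the rate is reported as unusable:

#include <linux/errno.h>
#include <asm/tsc.h>

/* Illustrative only: ordered TSC read using the helpers declared above. */
static inline int sample_tsc(cycles_t *out)
{
	if (!get_cycles_rate())		/* 0 means the TSC is unstable */
		return -EIO;

	get_cycles_barrier();		/* order rdtsc against earlier loads */
	*out = get_cycles();
	get_cycles_barrier();		/* ...and against later ones */
	return 0;
}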
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826..06abe8f409a 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -12,6 +12,7 @@ struct vsyscall_gtod_data {
u32 wall_time_nsec;
int sysctl_enabled;
+ int trace_clock_is_sync;
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
cycle_t (*vread)(void);
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fb..47b80f3ba4d 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -39,6 +39,14 @@ extern struct timezone sys_tz;
extern void map_vsyscall(void);
+#ifdef CONFIG_X86_64
+extern void update_trace_clock_is_sync_vdso(void);
+#else
+static inline void update_trace_clock_is_sync_vdso(void)
+{
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2cd88..717cf9c620b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -46,6 +46,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
@@ -66,9 +67,8 @@ obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP) += smpboot.o
obj-$(CONFIG_SMP) += setup_percpu.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 76b96d74978..c604d23b4f3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -33,6 +33,7 @@
#include <linux/dmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <trace/irq.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
@@ -868,7 +869,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
*/
exit_idle();
irq_enter();
+ trace_irq_entry(LOCAL_TIMER_VECTOR, regs, NULL);
local_apic_timer_interrupt();
+ trace_irq_exit(IRQ_HANDLED);
irq_exit();
set_irq_regs(old_regs);
@@ -1788,6 +1791,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
exit_idle();
irq_enter();
+ trace_irq_entry(SPURIOUS_APIC_VECTOR, NULL, NULL);
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1802,6 +1806,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt on CPU#%d, "
"should never happen.\n", smp_processor_id());
+ trace_irq_exit(IRQ_HANDLED);
irq_exit();
}
@@ -1814,6 +1819,7 @@ void smp_error_interrupt(struct pt_regs *regs)
exit_idle();
irq_enter();
+ trace_irq_entry(ERROR_APIC_VECTOR, NULL, NULL);
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
apic_write(APIC_ESR, 0);
@@ -1834,6 +1840,7 @@ void smp_error_interrupt(struct pt_regs *regs)
*/
pr_debug("APIC error on CPU%d: %02x(%02x)\n",
smp_processor_id(), v , v1);
+ trace_irq_exit(IRQ_HANDLED);
irq_exit();
}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a74..60939d5f226 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -227,6 +227,7 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
+#include <linux/idle.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -235,6 +236,7 @@
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
+#include <asm/idle.h>
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
@@ -947,10 +949,17 @@ recalc:
break;
}
}
+ enter_idle();
if (original_pm_idle)
original_pm_idle();
else
default_idle();
+ /*
+ * In many cases the interrupt that ended idle
+ * has already called exit_idle. But some idle
+ * loops can be woken up without interrupt.
+ */
+ __exit_idle();
local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088dda37..677f8475d9d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,6 +111,7 @@ void foo(void)
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd96..1aea11cd840 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -58,6 +58,7 @@ int main(void)
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396b..6052f6f65a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1069,6 +1069,7 @@ unsigned long kernel_eflags;
* debugging, no special alignment required.
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
+EXPORT_PER_CPU_SYMBOL_GPL(orig_ist);
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97..c8a6411d8ba 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -23,6 +23,7 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <trace/irq.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -402,8 +403,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
{
exit_idle();
irq_enter();
+ trace_irq_entry(THERMAL_APIC_VECTOR, regs, NULL);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
+ trace_irq_exit(IRQ_HANDLED);
irq_exit();
/* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723a6a1..6bed23e1c74 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -15,6 +15,7 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <linux/ltt-core.h>
#include <asm/stacktrace.h>
@@ -253,6 +254,8 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
if (!signr)
return;
+ if (in_nmi())
+ panic("Fatal exception in non-maskable interrupt");
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
@@ -277,6 +280,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
+#ifdef CONFIG_LTT
+ printk(KERN_EMERG "LTT NESTING LEVEL : %u", __get_cpu_var(ltt_nesting));
+ printk("\n");
+#endif
sysfs_printk_last_file();
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9ca3b0e343e..afd6d8ef78c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -80,6 +80,8 @@
#define nr_syscalls ((syscall_table_size)/4)
+#define NMI_MASK 0x04000000
+
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
@@ -321,8 +323,32 @@ END(ret_from_fork)
# userspace resumption stub bypassing syscall exit tracing
ALIGN
RING0_PTREGS_FRAME
+
ret_from_exception:
preempt_stop(CLBR_ANY)
+ GET_THREAD_INFO(%ebp)
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+ cmpl $USER_RPL, %eax
+ jae resume_userspace # returning to v8086 or userspace
+ testl $NMI_MASK,TI_preempt_count(%ebp)
+ jz resume_kernel /* Not nested over NMI ? */
+ testw $X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz resume_kernel /*
+ * If single-stepping an NMI handler,
+ * use the normal iret path instead of
+ * the popf/lret because lret would be
+ * single-stepped. This should not
+ * happen: it will reactivate NMIs
+ * prematurely.
+ */
+ TRACE_IRQS_IRET
+ RESTORE_REGS
+ addl $4, %esp # skip orig_eax/error_code
+ CFI_ADJUST_CFA_OFFSET -4
+ INTERRUPT_RETURN_NMI_SAFE
+
ret_from_intr:
GET_THREAD_INFO(%ebp)
check_userspace:
@@ -906,6 +932,10 @@ ENTRY(native_iret)
.previous
END(native_iret)
+ENTRY(native_nmi_return)
+ NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ?
+END(native_nmi_return)
+
ENTRY(native_irq_enable_sysexit)
sti
sysexit
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bbd5c80cb09..7800ff65aab 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -163,6 +163,8 @@ GLOBAL(return_to_handler)
#endif
+#define NMI_MASK 0x04000000
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -515,6 +517,8 @@ sysret_check:
/* Handle reschedules */
/* edx: work, edi: workmask */
sysret_careful:
+ testl $_TIF_KERNEL_TRACE,%edx /* Re-read : concurrently changed */
+ jnz ret_from_sys_call_trace
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
TRACE_IRQS_ON
@@ -524,6 +528,16 @@ sysret_careful:
popq_cfi %rdi
jmp sysret_check
+ret_from_sys_call_trace:
+ TRACE_IRQS_ON
+ sti
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %rdi
+ movq %rsp,%rdi
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+
/* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
@@ -872,6 +886,9 @@ ENTRY(native_iret)
.section __ex_table,"a"
.quad native_iret, bad_iret
.previous
+
+ENTRY(native_nmi_return)
+ NATIVE_INTERRUPT_RETURN_NMI_SAFE
#endif
.section .fixup,"ax"
@@ -924,6 +941,24 @@ retint_signal:
GET_THREAD_INFO(%rcx)
jmp retint_with_reschedule
+ /* Returning to kernel space from exception. */
+ /* rcx: threadinfo. interrupts off. */
+ENTRY(retexc_kernel)
+ testl $NMI_MASK,TI_preempt_count(%rcx)
+ jz retint_kernel /* Not nested over NMI ? */
+ testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */
+ jnz retint_kernel /*
+ * If single-stepping an NMI handler,
+ * use the normal iret path instead of
+ * the popf/lret because lret would be
+ * single-stepped. This should not
+ * happen: it will reactivate NMIs
+ * prematurely.
+ */
+ RESTORE_ARGS 0,8,0
+ TRACE_IRQS_IRETQ
+ INTERRUPT_RETURN_NMI_SAFE
+
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
@@ -1361,12 +1396,18 @@ ENTRY(paranoid_exit)
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
+paranoid_restore_no_nmi:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
+ GET_THREAD_INFO(%rcx)
TRACE_IRQS_IRETQ 0
+ testl $NMI_MASK,TI_preempt_count(%rcx)
+ jz paranoid_restore_no_nmi /* Nested over NMI ? */
+ testw $X86_EFLAGS_TF,EFLAGS-0(%rsp) /* trap flag? */
+ jnz paranoid_restore_no_nmi
RESTORE_ALL 8
- jmp irq_return
+ INTERRUPT_RETURN_NMI_SAFE
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
@@ -1465,7 +1506,7 @@ ENTRY(error_exit)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
testl %eax,%eax
- jne retint_kernel
+ jne retexc_kernel
LOCKDEP_SYS_EXIT_IRQ
movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..1fc5da98373 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -156,6 +156,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_64(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+ type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) ||
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
@@ -204,6 +205,7 @@ static void native_flush_tlb_single(unsigned long addr)
/* These are in entry.S */
extern void native_iret(void);
+extern void native_nmi_return(void);
extern void native_irq_enable_sysexit(void);
extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
@@ -373,6 +375,7 @@ struct pv_cpu_ops pv_cpu_ops = {
.usergs_sysret64 = native_usergs_sysret64,
#endif
.iret = native_iret,
+ .nmi_return = native_nmi_return,
.swapgs = native_swapgs,
.set_iopl_mask = native_set_iopl_mask,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab..ac372778bbc 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -1,10 +1,13 @@
-#include <asm/paravirt.h>
+#include <linux/stringify.h>
+#include <linux/irqflags.h>
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, nmi_return,
+ __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
@@ -41,6 +44,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, restore_fl);
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_cpu_ops, iret);
+ PATCH_SITE(pv_cpu_ops, nmi_return);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93e..5339e67dc15 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -1,12 +1,15 @@
+#include <linux/irqflags.h>
+#include <linux/stringify.h>
#include <asm/paravirt.h>
#include <asm/asm-offsets.h>
-#include <linux/stringify.h>
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_cpu_ops, iret, "iretq");
+DEF_NATIVE(pv_cpu_ops, nmi_return,
+ __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -51,6 +54,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_cpu_ops, iret);
+ PATCH_SITE(pv_cpu_ops, nmi_return);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff455419898..e0e4ffcad48 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -13,6 +13,7 @@
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <trace/events/power.h>
+#include <trace/sched.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/system.h>
@@ -23,6 +24,8 @@
#include <asm/i387.h>
#include <asm/debugreg.h>
+DEFINE_TRACE(sched_kthread_create);
+
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -278,6 +281,7 @@ extern void kernel_thread_helper(void);
int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct pt_regs regs;
+ long pid;
memset(&regs, 0, sizeof(regs));
@@ -299,7 +303,10 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.flags = X86_EFLAGS_IF | 0x2;
/* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+ pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED,
+ 0, &regs, 0, NULL, NULL);
+ trace_sched_kthread_create(fn, pid);
+ return pid;
}
EXPORT_SYMBOL(kernel_thread);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..256ba23211d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,9 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/idle.h>
+#include <trace/pm.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -59,6 +62,38 @@
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+DEFINE_TRACE(pm_idle_exit);
+DEFINE_TRACE(pm_idle_entry);
+
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
+void enter_idle(void)
+{
+ percpu_write(is_idle, 1);
+ trace_pm_idle_entry();
+ notify_idle(IDLE_START);
+}
+EXPORT_SYMBOL_GPL(enter_idle);
+
+void __exit_idle(void)
+{
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
+ return;
+ notify_idle(IDLE_END);
+ trace_pm_idle_exit();
+}
+EXPORT_SYMBOL_GPL(__exit_idle);
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+ /* idle loop has pid 0 */
+ if (current->pid)
+ return;
+ __exit_idle();
+}
+EXPORT_SYMBOL_GPL(exit_idle);
+
/*
* Return saved PC of a blocked thread.
*/
@@ -107,10 +142,18 @@ void cpu_idle(void)
play_dead();
local_irq_disable();
+ enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ /*
+ * In many cases the interrupt that ended idle
+ * has already called exit_idle. But some idle
+ * loops can be woken up without interrupt.
+ */
+ __exit_idle();
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bd387e8f73b..fbde94f1447 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -35,8 +35,10 @@
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
+#include <linux/idle.h>
#include <linux/io.h>
#include <linux/ftrace.h>
+#include <trace/pm.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -51,37 +53,34 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+DEFINE_TRACE(pm_idle_exit);
+DEFINE_TRACE(pm_idle_entry);
+
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-
void enter_idle(void)
{
percpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ /*
+ * Trace last event before calling notifiers. Notifiers flush
+ * data from buffers before going to idle.
+ */
+ trace_pm_idle_entry();
+ notify_idle(IDLE_START);
}
+EXPORT_SYMBOL_GPL(enter_idle);
-static void __exit_idle(void)
+void __exit_idle(void)
{
if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ notify_idle(IDLE_END);
+ trace_pm_idle_exit();
}
+EXPORT_SYMBOL_GPL(__exit_idle);
/* Called from interrupts to signify idle end */
void exit_idle(void)
@@ -91,6 +90,7 @@ void exit_idle(void)
return;
__exit_idle();
}
+EXPORT_SYMBOL_GPL(exit_idle);
#ifndef CONFIG_SMP
static inline void play_dead(void)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72..ee3024d4f61 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <trace/syscall.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -152,6 +153,9 @@ static const int arg_offs_table[] = {
X86_EFLAGS_DF | X86_EFLAGS_OF | \
X86_EFLAGS_RF | X86_EFLAGS_AC))
+DEFINE_TRACE(syscall_entry);
+DEFINE_TRACE(syscall_exit);
+
/*
* Determines whether a value may be installed in a segment register.
*/
@@ -1361,6 +1365,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (test_thread_flag(TIF_SINGLESTEP))
regs->flags |= X86_EFLAGS_TF;
+ trace_syscall_entry(regs, regs->orig_ax);
+
/* do the secure computing check first */
secure_computing(regs->orig_ax);
@@ -1396,6 +1402,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
+ trace_syscall_exit(regs->ax);
+
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d600829..5e74f6aa3c0 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -1,8 +1,11 @@
/* System call table for x86-64. */
#include <linux/linkage.h>
+#include <linux/module.h>
#include <linux/sys.h>
#include <linux/cache.h>
+#include <linux/marker.h>
+#include <linux/kallsyms.h>
#include <asm/asm-offsets.h>
#define __NO_STUBS
@@ -27,3 +30,18 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/unistd_64.h>
};
+
+void ltt_dump_sys_call_table(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
+ for (i = 0; i < __NR_syscall_max + 1; i++) {
+ sprint_symbol(namebuf, (unsigned long)sys_call_table[i]);
+ __trace_mark(0, syscall_state, sys_call_table,
+ call_data,
+ "id %d address %p symbol %s",
+ i, (void*)sys_call_table[i], namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table);
diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c
new file mode 100644
index 00000000000..47539e28276
--- /dev/null
+++ b/arch/x86/kernel/trace-clock.c
@@ -0,0 +1,302 @@
+/*
+ * arch/x86/kernel/trace-clock.c
+ *
+ * Trace clock for x86.
+ *
+ * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>, October 2008
+ */
+
+#include <linux/module.h>
+#include <linux/trace-clock.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/cpu.h>
+#include <linux/posix-timers.h>
+#include <asm/vgtod.h>
+
+static cycles_t trace_clock_last_tsc;
+static DEFINE_PER_CPU(struct timer_list, update_timer);
+static DEFINE_SPINLOCK(async_tsc_lock);
+static int async_tsc_refcount; /* Number of readers */
+static int async_tsc_enabled; /* Async TSC enabled on all online CPUs */
+
+int _trace_clock_is_sync = 1;
+EXPORT_SYMBOL_GPL(_trace_clock_is_sync);
+
+/*
+ * Is the trace clock being used by user-space? We leave the trace clock active
+ * as soon as user-space starts using it. We never unref the trace clock
+ * reference taken by user-space.
+ */
+static atomic_t user_trace_clock_ref;
+
+/*
+ * Called by check_tsc_sync_source from CPU hotplug.
+ */
+void set_trace_clock_is_sync(int state)
+{
+ _trace_clock_is_sync = state;
+ update_trace_clock_is_sync_vdso();
+}
+
+#if BITS_PER_LONG == 64
+static cycles_t read_last_tsc(void)
+{
+ return trace_clock_last_tsc;
+}
+#else
+/*
+ * A cmpxchg64 update can happen concurrently. Based on the assumption that
+ * two cmpxchg64 operations will never update it to the same value (the count always
+ * increases), reading it twice ensures that we read a coherent value with the
+ * same "sequence number".
+ */
+static cycles_t read_last_tsc(void)
+{
+ cycles_t val1, val2;
+
+ val1 = trace_clock_last_tsc;
+ for (;;) {
+ val2 = val1;
+ barrier();
+ val1 = trace_clock_last_tsc;
+ if (likely(val1 == val2))
+ break;
+ }
+ return val1;
+}
+#endif
+
+/*
+ * Support for architectures with non-sync TSCs.
+ * When the local TSC is discovered to lag behind the highest TSC counter, we
+ * increment the TSC count by an amount that should ideally be lower than the
+ * execution time of this routine, in cycles: this is the granularity we look
+ * for, since we must be able to order the events.
+ */
+notrace cycles_t trace_clock_async_tsc_read(void)
+{
+ cycles_t new_tsc, last_tsc;
+
+ WARN_ON(!async_tsc_refcount || !async_tsc_enabled);
+ new_tsc = get_cycles();
+ last_tsc = read_last_tsc();
+ do {
+ if (new_tsc < last_tsc)
+ new_tsc = last_tsc + TRACE_CLOCK_MIN_PROBE_DURATION;
+ /*
+ * If cmpxchg fails with a value higher than the new_tsc, don't
+ * retry: the value has been incremented and the events
+ * happened almost at the same time.
+ * We must retry if cmpxchg fails with a lower value:
+ * it means that we are the CPU with highest frequency and
+ * therefore MUST update the value.
+ */
+ last_tsc = cmpxchg64(&trace_clock_last_tsc, last_tsc, new_tsc);
+ } while (unlikely(last_tsc < new_tsc));
+ return new_tsc;
+}
+EXPORT_SYMBOL_GPL(trace_clock_async_tsc_read);
+
+static void update_timer_ipi(void *info)
+{
+ (void)trace_clock_async_tsc_read();
+}
+
+/*
+ * update_timer_fct - Timer function to resync the clocks
+ * @data: unused
+ *
+ * Fires every jiffy.
+ */
+static void update_timer_fct(unsigned long data)
+{
+ (void)trace_clock_async_tsc_read();
+ mod_timer_pinned(&per_cpu(update_timer, smp_processor_id()),
+ jiffies + 1);
+}
+
+static void enable_trace_clock(int cpu)
+{
+ init_timer(&per_cpu(update_timer, cpu));
+ per_cpu(update_timer, cpu).function = update_timer_fct;
+ per_cpu(update_timer, cpu).expires = jiffies + 1;
+ smp_call_function_single(cpu, update_timer_ipi, NULL, 1);
+ add_timer_on(&per_cpu(update_timer, cpu), cpu);
+}
+
+static void disable_trace_clock(int cpu)
+{
+ del_timer_sync(&per_cpu(update_timer, cpu));
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ int cpu;
+
+ spin_lock(&async_tsc_lock);
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ /*
+ * trace_clock_is_sync() is updated by set_trace_clock_is_sync()
+ * code, protected by cpu hotplug disable.
+ * It is ok to let the hotplugged CPU read the timebase before
+ * the CPU_ONLINE notification. It's just there to give a
+ * maximum bound to the TSC error.
+ */
+ if (async_tsc_refcount && !trace_clock_is_sync()) {
+ if (!async_tsc_enabled) {
+ async_tsc_enabled = 1;
+ for_each_online_cpu(cpu)
+ enable_trace_clock(cpu);
+ } else {
+ enable_trace_clock(hotcpu);
+ }
+ }
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ if (!async_tsc_refcount && num_online_cpus() == 1)
+ set_trace_clock_is_sync(1);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /*
+ * We cannot stop the trace clock on other CPUs when readers are
+ * active even if we go back to a synchronized state (1 CPU)
+ * because the CPU left could be the one lagging behind.
+ */
+ if (async_tsc_refcount && async_tsc_enabled)
+ disable_trace_clock(hotcpu);
+ if (!async_tsc_refcount && num_online_cpus() == 1)
+ set_trace_clock_is_sync(1);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ spin_unlock(&async_tsc_lock);
+
+ return NOTIFY_OK;
+}
+
+int get_trace_clock(void)
+{
+ int cpu;
+
+ if (!trace_clock_is_sync()) {
+ printk(KERN_WARNING
+ "Trace clock falls back on cache-line bouncing\n"
+ "workaround due to non-synchronized TSCs.\n"
+ "This workaround preserves event order across CPUs.\n"
+ "Please consider disabling Speedstep or PowerNow and\n"
+ "using kernel parameters "
+ "\"force_tsc_sync=1 idle=poll\"\n"
+ "for accurate and fast tracing clock source.\n");
+ }
+
+ get_online_cpus();
+ spin_lock(&async_tsc_lock);
+ if (async_tsc_refcount++ || trace_clock_is_sync())
+ goto end;
+
+ async_tsc_enabled = 1;
+ for_each_online_cpu(cpu)
+ enable_trace_clock(cpu);
+end:
+ spin_unlock(&async_tsc_lock);
+ put_online_cpus();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_trace_clock);
+
+void put_trace_clock(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ spin_lock(&async_tsc_lock);
+ WARN_ON(async_tsc_refcount <= 0);
+ if (async_tsc_refcount != 1 || !async_tsc_enabled)
+ goto end;
+
+ for_each_online_cpu(cpu)
+ disable_trace_clock(cpu);
+ async_tsc_enabled = 0;
+end:
+ async_tsc_refcount--;
+ if (!async_tsc_refcount && num_online_cpus() == 1)
+ set_trace_clock_is_sync(1);
+ spin_unlock(&async_tsc_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(put_trace_clock);
+
+static int posix_get_trace(clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+ int ret;
+
+ /*
+ * Yes, there is a race here that would lead to refcount being
+ * incremented more than once, but all we care about is leaving the trace
+ * clock active forever, so precise accounting is not needed.
+ */
+ if (unlikely(!atomic_read(&user_trace_clock_ref))) {
+ ret = get_trace_clock();
+ if (ret)
+ return ret;
+ atomic_inc(&user_trace_clock_ref);
+ }
+ lts->lttng_ts = trace_clock_read64();
+ return 0;
+}
+
+static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+ lts->lttng_ts = trace_clock_frequency();
+ return 0;
+}
+
+static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+ lts->lttng_ts = TRACE_CLOCK_RES;
+ return 0;
+}
+
+static __init int init_unsync_trace_clock(void)
+{
+ struct k_clock clock_trace = {
+ .clock_getres = posix_get_trace_res,
+ .clock_get = posix_get_trace,
+ };
+ struct k_clock clock_trace_freq = {
+ .clock_getres = posix_get_trace_res,
+ .clock_get = posix_get_trace_freq,
+ };
+
+ register_posix_clock(CLOCK_TRACE, &clock_trace);
+ register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq);
+
+ hotcpu_notifier(hotcpu_callback, 4);
+ return 0;
+}
+early_initcall(init_unsync_trace_clock);
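
A hedged user-space sketch of consuming the POSIX clocks registered above; the CLOCK_TRACE and CLOCK_TRACE_FREQ ids are assumed to come from the patched kernel headers (they are not defined in this file), and the 64-bit payload is recovered the same way the kernel overlays it in union lttng_timespec:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	uint64_t cycles, freq;

	if (clock_gettime(CLOCK_TRACE, &ts))		/* assumed clockid */
		return 1;
	memcpy(&cycles, &ts, sizeof(cycles));		/* first 8 bytes carry the u64 */

	if (clock_gettime(CLOCK_TRACE_FREQ, &ts))	/* assumed clockid */
		return 1;
	memcpy(&freq, &ts, sizeof(freq));

	printf("trace clock: %llu cycles @ %llu Hz\n",
	       (unsigned long long)cycles, (unsigned long long)freq);
	return 0;
}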
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..d45030679f9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -31,6 +31,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <trace/trap.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -52,6 +53,7 @@
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/traps.h>
+#include <asm/unistd.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/mce.h>
@@ -76,11 +78,21 @@ char ignore_fpu_irq;
* F0 0F bug workaround.
*/
gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
+
+extern unsigned long sys_call_table[];
+extern unsigned long syscall_table_size;
+
#endif
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
+/*
+ * Also used in arch/x86/mm/fault.c.
+ */
+DEFINE_TRACE(trap_entry);
+DEFINE_TRACE(trap_exit);
+
static int ignore_nmis;
int unknown_nmi_panic;
@@ -122,6 +134,8 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
{
struct task_struct *tsk = current;
+ trace_trap_entry(regs, trapnr);
+
#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK) {
/*
@@ -168,7 +182,7 @@ trap_signal:
force_sig_info(signr, info, tsk);
else
force_sig(signr, tsk);
- return;
+ goto end;
kernel_trap:
if (!fixup_exception(regs)) {
@@ -176,15 +190,17 @@ kernel_trap:
tsk->thread.trap_no = trapnr;
die(str, regs, error_code);
}
- return;
+ goto end;
#ifdef CONFIG_X86_32
vm86_trap:
if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, trapnr))
goto trap_signal;
- return;
+ goto end;
#endif
+end:
+ trace_trap_exit();
}
#define DO_ERROR(trapnr, signr, str, name) \
@@ -285,7 +301,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
printk("\n");
}
+ trace_trap_entry(regs, 13);
force_sig(SIGSEGV, tsk);
+ trace_trap_exit();
return;
#ifdef CONFIG_X86_32
@@ -371,9 +389,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
static notrace __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
+ trace_trap_entry(regs, 2);
+
if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
NOTIFY_STOP)
- return;
+ goto end;
#ifdef CONFIG_MCA
/*
* Might actually be able to figure out what the guilty party
@@ -381,7 +401,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
*/
if (MCA_bus) {
mca_handle_nmi();
- return;
+ goto end;
}
#endif
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
@@ -392,19 +412,23 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
panic("NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
+end:
+ trace_trap_exit();
}
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ trace_trap_entry(regs, 2);
+
/*
* CPU-specific NMI must be processed before non-CPU-specific
* NMI, otherwise we may lose it, because the CPU-specific
* NMI can not be detected/processed on other CPUs.
*/
if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
+ goto end;
/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
raw_spin_lock(&nmi_reason_lock);
@@ -423,11 +447,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
reassert_nmi();
#endif
raw_spin_unlock(&nmi_reason_lock);
- return;
+ goto end;
}
raw_spin_unlock(&nmi_reason_lock);
unknown_nmi_error(reason, regs);
+end:
+ trace_trap_exit();
}
dotraplinkage notrace __kprobes void
@@ -570,8 +596,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_sti(regs);
if (regs->flags & X86_VM_MASK) {
+ trace_trap_entry(regs, 1);
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
+ trace_trap_exit();
preempt_conditional_cli(regs);
return;
}
@@ -589,13 +617,32 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
regs->flags &= ~X86_EFLAGS_TF;
}
si_code = get_si_code(tsk->thread.debugreg6);
- if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) {
+ trace_trap_entry(regs, 1);
send_sigtrap(tsk, regs, error_code, si_code);
+ trace_trap_exit();
+ }
preempt_conditional_cli(regs);
return;
}
+#ifdef CONFIG_X86_32
+void ltt_dump_sys_call_table(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
+ for (i = 0; i < NR_syscalls; i++) {
+ sprint_symbol(namebuf, sys_call_table[i]);
+ __trace_mark(0, syscall_state, sys_call_table, call_data,
+ "id %d address %p symbol %s",
+ i, (void*)sys_call_table[i], namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table);
+#endif
+
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
@@ -701,11 +748,13 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
+ trace_trap_entry(regs, 16);
conditional_sti(regs);
#if 0
/* No need to warn about this any longer. */
printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
#endif
+ trace_trap_exit();
}
asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
@@ -738,6 +787,21 @@ void __math_state_restore(void)
tsk->fpu_counter++;
}
+void ltt_dump_idt_table(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
+ for (i = 0; i < IDT_ENTRIES; i++) {
+ unsigned long address = gate_offset(idt_table[i]);
+ sprint_symbol(namebuf, address);
+ __trace_mark(0, irq_state, idt_table, call_data,
+ "irq %d address %p symbol %s",
+ i, (void *)address, namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_idt_table);
+
/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
deleted file mode 100644
index 0aa5fed8b9e..00000000000
--- a/arch/x86/kernel/tsc_sync.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * check TSC synchronization.
- *
- * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
- *
- * We check whether all boot CPUs have their TSC's synchronized,
- * print a warning if not and turn off the TSC clock-source.
- *
- * The warp-check is point-to-point between two CPUs, the CPU
- * initiating the bootup is the 'source CPU', the freshly booting
- * CPU is the 'target CPU'.
- *
- * Only two CPUs may participate - they can enter in any order.
- * ( The serial nature of the boot logic and the CPU hotplug lock
- * protects against more than 2 CPUs entering this code. )
- */
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/nmi.h>
-#include <asm/tsc.h>
-
-/*
- * Entry/exit counters that make sure that both CPUs
- * run the measurement code at once:
- */
-static __cpuinitdata atomic_t start_count;
-static __cpuinitdata atomic_t stop_count;
-
-/*
- * We use a raw spinlock in this exceptional case, because
- * we want to have the fastest, inlined, non-debug version
- * of a critical section, to be able to prove TSC time-warps:
- */
-static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-
-static __cpuinitdata cycles_t last_tsc;
-static __cpuinitdata cycles_t max_warp;
-static __cpuinitdata int nr_warps;
-
-/*
- * TSC-warp measurement loop running on both CPUs:
- */
-static __cpuinit void check_tsc_warp(void)
-{
- cycles_t start, now, prev, end;
- int i;
-
- rdtsc_barrier();
- start = get_cycles();
- rdtsc_barrier();
- /*
- * The measurement runs for 20 msecs:
- */
- end = start + tsc_khz * 20ULL;
- now = start;
-
- for (i = 0; ; i++) {
- /*
- * We take the global lock, measure TSC, save the
- * previous TSC that was measured (possibly on
- * another CPU) and update the previous TSC timestamp.
- */
- arch_spin_lock(&sync_lock);
- prev = last_tsc;
- rdtsc_barrier();
- now = get_cycles();
- rdtsc_barrier();
- last_tsc = now;
- arch_spin_unlock(&sync_lock);
-
- /*
- * Be nice every now and then (and also check whether
- * measurement is done [we also insert a 10 million
- * loops safety exit, so we dont lock up in case the
- * TSC readout is totally broken]):
- */
- if (unlikely(!(i & 7))) {
- if (now > end || i > 10000000)
- break;
- cpu_relax();
- touch_nmi_watchdog();
- }
- /*
- * Outside the critical section we can now see whether
- * we saw a time-warp of the TSC going backwards:
- */
- if (unlikely(prev > now)) {
- arch_spin_lock(&sync_lock);
- max_warp = max(max_warp, prev - now);
- nr_warps++;
- arch_spin_unlock(&sync_lock);
- }
- }
- WARN(!(now-start),
- "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
- now-start, end-start);
-}
-
-/*
- * Source CPU calls into this - it waits for the freshly booted
- * target CPU to arrive and then starts the measurement:
- */
-void __cpuinit check_tsc_sync_source(int cpu)
-{
- int cpus = 2;
-
- /*
- * No need to check if we already know that the TSC is not
- * synchronized:
- */
- if (unsynchronized_tsc())
- return;
-
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
- pr_info(
- "Skipped synchronization checks as TSC is reliable.\n");
- return;
- }
-
- /*
- * Reset it - in case this is a second bootup:
- */
- atomic_set(&stop_count, 0);
-
- /*
- * Wait for the target to arrive:
- */
- while (atomic_read(&start_count) != cpus-1)
- cpu_relax();
- /*
- * Trigger the target to continue into the measurement too:
- */
- atomic_inc(&start_count);
-
- check_tsc_warp();
-
- while (atomic_read(&stop_count) != cpus-1)
- cpu_relax();
-
- if (nr_warps) {
- pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
- smp_processor_id(), cpu);
- pr_warning("Measured %Ld cycles TSC warp between CPUs, "
- "turning off TSC clock.\n", max_warp);
- mark_tsc_unstable("check_tsc_sync_source failed");
- } else {
- pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
- smp_processor_id(), cpu);
- }
-
- /*
- * Reset it - just in case we boot another CPU later:
- */
- atomic_set(&start_count, 0);
- nr_warps = 0;
- max_warp = 0;
- last_tsc = 0;
-
- /*
- * Let the target continue with the bootup:
- */
- atomic_inc(&stop_count);
-}
-
-/*
- * Freshly booted CPUs call into this:
- */
-void __cpuinit check_tsc_sync_target(void)
-{
- int cpus = 2;
-
- if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
- return;
-
- /*
- * Register this CPU's participation and wait for the
- * source CPU to start the measurement:
- */
- atomic_inc(&start_count);
- while (atomic_read(&start_count) != cpus)
- cpu_relax();
-
- check_tsc_warp();
-
- /*
- * Ok, we are done:
- */
- atomic_inc(&stop_count);
-
- /*
- * Wait for the source CPU to print stuff:
- */
- while (atomic_read(&stop_count) != cpus)
- cpu_relax();
-}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..df18f14c473 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,6 +44,8 @@
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
#define __vsyscall(nr) \
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
@@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
+ .trace_clock_is_sync = 1,
};
void update_vsyscall_tz(void)
@@ -73,6 +76,16 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
+void update_trace_clock_is_sync_vdso(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
+ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso);
+
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
struct clocksource *clock, u32 mult)
{
@@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = *wtm;
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
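
Both update_trace_clock_is_sync_vdso() and update_vsyscall() above publish trace_clock_is_sync into the vsyscall page under the gtod seqlock, so that vDSO readers (see the vclock_gettime.c hunk further down) either see a consistent snapshot or retry. A stripped-down userspace sketch of that publish/retry protocol is below; the names are hypothetical and the kernel's write_seqlock_irqsave()/read_seqbegin()/read_seqretry() primitives are replaced by C11 atomics.

/* seqlock_sketch.c -- illustrative analogue of the gtod seqlock, not part of this patch. */
#include <stdatomic.h>

struct gtod_sketch {
	atomic_uint seq;		/* odd while an update is in progress */
	atomic_int  trace_clock_is_sync;
};

static struct gtod_sketch gtod_copy;	/* analogue of __vsyscall_gtod_data */

/* Writer side, cf. write_seqlock_irqsave()/write_sequnlock_irqrestore(). */
static void publish_is_sync(struct gtod_sketch *g, int is_sync)
{
	atomic_fetch_add(&g->seq, 1);				/* seq goes odd  */
	atomic_store_explicit(&g->trace_clock_is_sync, is_sync,
			      memory_order_relaxed);
	atomic_fetch_add(&g->seq, 1);				/* seq goes even */
}

/* Reader side, cf. read_seqbegin()/read_seqretry(): retry on concurrent update. */
static int read_is_sync(struct gtod_sketch *g)
{
	unsigned int seq1, seq2;
	int val;

	do {
		seq1 = atomic_load(&g->seq);
		val  = atomic_load_explicit(&g->trace_clock_is_sync,
					    memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		seq2 = atomic_load(&g->seq);
	} while ((seq1 & 1) || seq1 != seq2);

	return val;
}

int main(void)
{
	publish_is_sync(&gtod_copy, 1);
	return !read_is_sync(&gtod_copy);	/* exits 0 when the flag reads back as 1 */
}
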
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac..f894af174b8 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -32,7 +32,7 @@
#include "irq.h"
#include <linux/kvm_host.h>
-#include "trace.h"
+#include <asm/kvm-trace.h>
static void pic_irq_request(struct kvm *kvm, int level);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d365..58bcbce5b02 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -36,7 +36,7 @@
#include <asm/atomic.h>
#include "kvm_cache_regs.h"
#include "irq.h"
-#include "trace.h"
+#include <asm/kvm-trace.h>
#include "x86.h"
#ifndef CONFIG_X86_64
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d4..3612044ed1f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -163,7 +163,7 @@ module_param(oos_shadow, bool, 0644);
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
+#include <asm/kvm-mmutrace.h>
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d8a15a17d76..42342e8cc18 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,7 +34,7 @@
#include <asm/kvm_para.h>
#include <asm/virtext.h>
-#include "trace.h"
+#include <asm/kvm-trace.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb8..d12b42e234b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,7 +40,7 @@
#include <asm/i387.h>
#include <asm/xcr.h>
-#include "trace.h"
+#include <asm/kvm-trace.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85b..6a8cb6fe5c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -47,7 +47,7 @@
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
-#include "trace.h"
+#include <asm/kvm-trace.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index eba687f0cc0..07f7a272226 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1276,6 +1276,7 @@ __init void lguest_init(void)
pv_cpu_ops.cpuid = lguest_cpuid;
pv_cpu_ops.load_idt = lguest_load_idt;
pv_cpu_ops.iret = lguest_iret;
+ pv_cpu_ops.nmi_return = lguest_iret;
pv_cpu_ops.load_sp0 = lguest_load_sp0;
pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
pv_cpu_ops.set_ldt = lguest_set_ldt;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f8702d1..abeb09914d5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <trace/fault.h>
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -35,6 +36,11 @@ enum x86_pf_error_code {
PF_INSTR = 1 << 4,
};
+DEFINE_TRACE(page_fault_entry);
+DEFINE_TRACE(page_fault_exit);
+DEFINE_TRACE(page_fault_nosem_entry);
+DEFINE_TRACE(page_fault_nosem_exit);
+
/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
@@ -719,6 +725,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
+ trace_page_fault_nosem_entry(regs, 14, address);
if (unlikely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
@@ -728,6 +735,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+ trace_page_fault_nosem_exit();
return;
}
@@ -1130,7 +1138,9 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
+ trace_page_fault_entry(regs, 14, mm, vma, address, write);
fault = handle_mm_fault(mm, vma, address, flags);
+ trace_page_fault_exit(fault);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
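
The fault.c hunk above adds LTTng-style instrumentation: DEFINE_TRACE() instantiates each tracepoint, and the trace_page_fault_*() calls fire them around handle_mm_fault() and on the no-semaphore error path. Conceptually a tracepoint is a named hook that invokes whatever probes have been registered on it; the self-contained sketch below illustrates that idea only, with made-up names, and is not the kernel tracepoint API.

/* tracepoint_sketch.c -- conceptual stand-in for DEFINE_TRACE()/trace_*(), not the kernel API. */
#include <stdio.h>

#define MAX_PROBES 4

typedef void (*fault_probe_t)(unsigned long address, int write);

static fault_probe_t fault_probes[MAX_PROBES];

/* Rough analogue of registering a probe on a tracepoint. */
static int register_fault_probe(fault_probe_t probe)
{
	for (int i = 0; i < MAX_PROBES; i++) {
		if (!fault_probes[i]) {
			fault_probes[i] = probe;
			return 0;
		}
	}
	return -1;
}

/* Rough analogue of trace_page_fault_entry(): call every registered probe. */
static void trace_fault_entry(unsigned long address, int write)
{
	for (int i = 0; i < MAX_PROBES; i++)
		if (fault_probes[i])
			fault_probes[i](address, write);
}

static void log_probe(unsigned long address, int write)
{
	printf("fault at %#lx (%s)\n", address, write ? "write" : "read");
}

int main(void)
{
	register_fault_probe(log_probe);
	trace_fault_entry(0xdeadbeefUL, 1);	/* what the instrumented path would do */
	return 0;
}
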
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8..14b9317eccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,6 +6,7 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <trace/irq.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -141,6 +142,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
f = &flush_state[sender];
+ trace_irq_entry(sender, regs, NULL);
+
if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
goto out;
/*
@@ -167,6 +170,7 @@ out:
cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
smp_mb__after_clear_bit();
inc_irq_stat(irq_tlb_count);
+ trace_irq_exit(IRQ_HANDLED);
}
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6..99513642a0e 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
+#include <linux/mfd/core.h>
#include <asm/io.h>
#include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
static int __devinit olpc_xo1_probe(struct platform_device *pdev)
{
struct resource *res;
+ int err;
/* don't run on non-XOs */
if (!machine_is_olpc())
return -ENODEV;
+ err = mfd_cell_enable(pdev);
+ if (err)
+ return err;
+
res = platform_get_resource(pdev, IORESOURCE_IO, 0);
if (!res) {
dev_err(&pdev->dev, "can't fetch device resource info\n");
return -EIO;
}
-
- if (!request_region(res->start, resource_size(res), DRV_NAME)) {
- dev_err(&pdev->dev, "can't request region\n");
- return -EIO;
- }
-
- if (strcmp(pdev->name, "cs5535-pms") == 0)
+ if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
pms_base = res->start;
- else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
acpi_base = res->start;
/* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
static int __devexit olpc_xo1_remove(struct platform_device *pdev)
{
- struct resource *r;
-
- r = platform_get_resource(pdev, IORESOURCE_IO, 0);
- release_region(r->start, resource_size(r));
+ mfd_cell_disable(pdev);
- if (strcmp(pdev->name, "cs5535-pms") == 0)
+ if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
pms_base = 0;
- else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+ else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
acpi_base = 0;
pm_power_off = NULL;
@@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev)
static struct platform_driver cs5535_pms_drv = {
.driver = {
- .name = "cs5535-pms",
+ .name = "olpc-xo1-pms",
.owner = THIS_MODULE,
},
.probe = olpc_xo1_probe,
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
static struct platform_driver cs5535_acpi_drv = {
.driver = {
- .name = "cs5535-acpi",
+ .name = "olpc-xo1-acpi",
.owner = THIS_MODULE,
},
.probe = olpc_xo1_probe,
@@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void)
{
int r;
- r = platform_driver_register(&cs5535_pms_drv);
+ r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms");
if (r)
return r;
- r = platform_driver_register(&cs5535_acpi_drv);
+ r = mfd_shared_platform_driver_register(&cs5535_acpi_drv,
+ "cs5535-acpi");
if (r)
- platform_driver_unregister(&cs5535_pms_drv);
+ mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
return r;
}
static void __exit olpc_xo1_exit(void)
{
- platform_driver_unregister(&cs5535_acpi_drv);
- platform_driver_unregister(&cs5535_pms_drv);
+ mfd_shared_platform_driver_unregister(&cs5535_acpi_drv);
+ mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
}
MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:olpc-xo1");
+MODULE_ALIAS("platform:cs5535-pms");
module_init(olpc_xo1_init);
module_exit(olpc_xo1_exit);
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..7bc481508d0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
#include "vextern.h"
#define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,46 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
return 0;
}
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and return its raw value via the lttng_timespec union in the timespec.
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+ unsigned long seq;
+ union lttng_timespec *lts = (union lttng_timespec *) ts;
+
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ if (unlikely(!gtod->trace_clock_is_sync))
+ return vdso_fallback_gettime(CLOCK_TRACE, ts);
+ /*
+	 * The rdtsc is not protected by rdtsc_barrier() because tracing
+	 * cannot achieve that level of precision anyway.
+	 * Recording an event is not an atomic operation, so the small
+	 * chance of imprecision does not justify the overhead of a
+	 * barrier.
+ */
+ /*
+ * TODO: check that vget_cycles(), using paravirt ops, will
+ * match the TSC read by get_cycles() at the kernel level.
+ */
+ lts->lttng_ts = vget_cycles();
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+ return 0;
+}
+
+/*
+ * Return cpu_khz. This has to go through a syscall because the value
+ * cannot be read directly from userspace, and it is only called once,
+ * at the beginning of a tracing session.
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+ return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
if (likely(gtod->sysctl_enabled))
@@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
return do_realtime_coarse(ts);
case CLOCK_MONOTONIC_COARSE:
return do_monotonic_coarse(ts);
+ case CLOCK_TRACE:
+ return do_trace_clock(ts);
+ case CLOCK_TRACE_FREQ:
+ return do_trace_clock_freq(ts);
+ default:
+ return -EINVAL;
}
return vdso_fallback_gettime(clock, ts);
}
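
With the two new clock ids handled in the vDSO, a tracer can read the trace clock and its frequency through clock_gettime() without entering the kernel, except when the TSCs are not synchronized and do_trace_clock() falls back to the real syscall. The userspace sketch below shows the intended call pattern; the numeric CLOCK_TRACE/CLOCK_TRACE_FREQ values and the exact union lttng_timespec layout are defined elsewhere in the LTTng patch set, so the constants and the frequency field used here are placeholders.

/* trace_clock_user_sketch.c -- illustrative only; constants are placeholders. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TRACE
#define CLOCK_TRACE		14	/* placeholder: see the patched linux/time.h */
#endif
#ifndef CLOCK_TRACE_FREQ
#define CLOCK_TRACE_FREQ	15	/* placeholder */
#endif

union lttng_timespec {
	struct timespec ts;
	uint64_t lttng_ts;	/* raw TSC value, as filled in by do_trace_clock() */
};

int main(void)
{
	union lttng_timespec now, freq;

	if (clock_gettime(CLOCK_TRACE, &now.ts) == 0)
		printf("trace clock (TSC): %llu\n",
		       (unsigned long long)now.lttng_ts);

	/* Assumes the frequency is returned through the same union layout. */
	if (clock_gettime(CLOCK_TRACE_FREQ, &freq.ts) == 0)
		printf("trace clock frequency: %llu kHz\n",
		       (unsigned long long)freq.lttng_ts);

	return 0;
}
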
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45f..e3839c74ec4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -974,6 +974,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.read_pmc = native_read_pmc,
.iret = xen_iret,
+ .nmi_return = xen_iret,
.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
.usergs_sysret32 = xen_sysret32,