Diffstat (limited to 'arch/x86')
48 files changed, 885 insertions, 289 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aa..b0389519b6d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86 select HAVE_READQ select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_GET_CYCLES select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS @@ -27,9 +28,11 @@ config X86 select HAVE_KPROBES select HAVE_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB + select HAVE_LTT_DUMP_TABLES select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_TRACE_CLOCK select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_C_RECORDMCOUNT @@ -208,10 +211,12 @@ config HAVE_INTEL_TXT config X86_32_SMP def_bool y depends on X86_32 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_64_SMP def_bool y depends on X86_64 && SMP + select HAVE_UNSYNCHRONIZED_TSC config X86_HT def_bool y diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 29cdcd02ead..accd6b42bd2 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -8,8 +8,11 @@ #include <linux/shm.h> #include <linux/ipc.h> #include <linux/compat.h> +#include <trace/ipc.h> #include <asm/sys_ia32.h> +DEFINE_TRACE(ipc_call); + asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) { @@ -18,6 +21,8 @@ asmlinkage long sys32_ipc(u32 call, int first, int second, int third, version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; + trace_ipc_call(call, first); + switch (call) { case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 1f11ce44e95..d09bb03653f 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -37,7 +37,7 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) @@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { + int ret = 0; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) /* Real i386 machines have no cmpxchg instruction */ @@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, return -ENOSYS; #endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" "2:\t.section .fixup, \"ax\"\n" - "3:\tmov %2, %0\n" + "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" _ASM_EXTABLE(1b, 3b) - : "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "0" (oldval) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "1" (oldval) : "memory" ); - return oldval; + *uval = oldval; + return ret; 
} #endif diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 38d87379e27..9b1db108f9e 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -1,20 +1,9 @@ #ifndef _ASM_X86_IDLE_H #define _ASM_X86_IDLE_H -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -#endif /* CONFIG_X86_64 */ +extern void enter_idle(void); +extern void __exit_idle(void); +extern void exit_idle(void); void c1e_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8bf10..fdf897373e1 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -56,6 +56,61 @@ static inline void native_halt(void) #endif +#ifdef CONFIG_X86_64 +/* + * Only returns from a trap or exception to a NMI context (intra-privilege + * level near return) to the same SS and CS segments. Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. It takes care of modifying the eflags, rsp and returning to the + * previous function. + * + * The stack, at that point, looks like : + * + * 0(rsp) RIP + * 8(rsp) CS + * 16(rsp) EFLAGS + * 24(rsp) RSP + * 32(rsp) SS + * + * Upon execution : + * Copy EIP to the top of the return stack + * Update top of return stack address + * Pop eflags into the eflags register + * Make the return stack current + * Near return (popping the return address from the return stack) + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushq %rax; \ + movq %rsp, %rax; \ + movq 24+8(%rax), %rsp; \ + pushq 0+8(%rax); \ + pushq 16+8(%rax); \ + movq (%rax), %rax; \ + popfq; \ + ret +#else +/* + * Protected mode only, no V8086. Implies that protected mode must + * be entered before NMIs or MCEs are enabled. Only returns from a trap or + * exception to a NMI context (intra-privilege level far return). Should be used + * upon trap or exception return when nested over a NMI context so no iret is + * issued. + * + * The stack, at that point, looks like : + * + * 0(esp) EIP + * 4(esp) CS + * 8(esp) EFLAGS + * + * Upon execution : + * Copy the stack eflags to top of stack + * Pop eflags into the eflags register + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS. + */ +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \ + popfl; \ + lret $4 +#endif + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -112,6 +167,7 @@ static inline unsigned long arch_local_irq_save(void) #define ENABLE_INTERRUPTS(x) sti #define DISABLE_INTERRUPTS(x) cli +#define INTERRUPT_RETURN_NMI_SAFE NATIVE_INTERRUPT_RETURN_NMI_SAFE #ifdef CONFIG_X86_64 #define SWAPGS swapgs diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/include/asm/kvm-mmutrace.h index b60b4fdb3ed..42d117d0418 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/include/asm/kvm-mmutrace.h @@ -217,9 +217,9 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace +#define TRACE_INCLUDE_FILE kvm-mmutrace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/trace.h b/arch/x86/include/asm/kvm-trace.h index 1357d7cf4ec..c1e151c092b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/include/asm/kvm-trace.h @@ -701,9 +701,9 @@ TRACE_EVENT(kvm_emulate_insn, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_PATH asm #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace +#define TRACE_INCLUDE_FILE kvm-trace /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8ab17..1ef6906c179 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -962,6 +962,10 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) +#define INTERRUPT_RETURN_NMI_SAFE \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE, \ + jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return) + #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c86..3e0634cc127 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -181,6 +181,7 @@ struct pv_cpu_ops { /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); + void (*nmi_return)(void); void (*swapgs)(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a..58a37ae7565 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_KERNEL_TRACE 9 /* kernel trace active */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -105,6 +106,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_KERNEL_TRACE (1 << TIF_KERNEL_TRACE) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -121,18 +123,19 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_KERNEL_TRACE) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_KERNEL_TRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & \ ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) + 
_TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU|_TIF_KERNEL_TRACE)) /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h new file mode 100644 index 00000000000..8ca73323366 --- /dev/null +++ b/arch/x86/include/asm/trace-clock.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +/* + * linux/arch/x86/include/asm/trace-clock.h + * + * Copyright (C) 2005,2006,2008 + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Trace clock definitions for x86. + */ + +#include <linux/timex.h> +#include <linux/time.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +/* Minimum duration of a probe, in cycles */ +#define TRACE_CLOCK_MIN_PROBE_DURATION 200 +#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION + +union lttng_timespec { + struct timespec ts; + u64 lttng_ts; +}; + +extern cycles_t trace_clock_async_tsc_read(void); + +extern int _trace_clock_is_sync; +static inline int trace_clock_is_sync(void) +{ + return _trace_clock_is_sync; +} + +static inline u32 trace_clock_read32(void) +{ + u32 cycles; + + if (likely(trace_clock_is_sync())) + cycles = (u32)get_cycles(); /* only need the 32 LSB */ + else + cycles = (u32)trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_read64(void) +{ + u64 cycles; + + if (likely(trace_clock_is_sync())) + cycles = get_cycles(); + else + cycles = trace_clock_async_tsc_read(); + return cycles; +} + +static inline u64 trace_clock_frequency(void) +{ + return (u64)cpu_khz * 1000; +} + +static inline u32 trace_clock_freq_scale(void) +{ + return 1; +} + +extern int get_trace_clock(void); +extern void put_trace_clock(void); + +extern void set_trace_clock_is_sync(int state); + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ca132fc0d0..28e56e1ec3c 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,18 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +static inline cycles_t get_cycles_rate(void) +{ + if (check_tsc_unstable()) + return 0; + return (cycles_t)tsc_khz * 1000; +} + +static inline void get_cycles_barrier(void) +{ + rdtsc_barrier(); +} + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: @@ -62,4 +74,10 @@ extern int notsc_setup(char *); extern void save_sched_clock_state(void); extern void restore_sched_clock_state(void); +extern int test_tsc_synchronization(void); +extern int _tsc_is_sync; +static inline int tsc_is_sync(void) +{ + return _tsc_is_sync; +} #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e204826..06abe8f409a 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -12,6 +12,7 @@ struct vsyscall_gtod_data { u32 wall_time_nsec; int sysctl_enabled; + int trace_clock_is_sync; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d0983d255fb..47b80f3ba4d 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -39,6 +39,14 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +#ifdef CONFIG_X86_64 +extern void update_trace_clock_is_sync_vdso(void); +#else +static inline void 
update_trace_clock_is_sync_vdso(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd88..717cf9c620b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,6 +46,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-$(CONFIG_HAVE_TRACE_CLOCK) += trace-clock.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -66,9 +67,8 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978..c604d23b4f3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -33,6 +33,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <trace/irq.h> #include <asm/perf_event.h> #include <asm/x86_init.h> @@ -868,7 +869,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) */ exit_idle(); irq_enter(); + trace_irq_entry(LOCAL_TIMER_VECTOR, regs, NULL); local_apic_timer_interrupt(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); set_irq_regs(old_regs); @@ -1788,6 +1791,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(SPURIOUS_APIC_VECTOR, NULL, NULL); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1802,6 +1806,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } @@ -1814,6 +1819,7 @@ void smp_error_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); + trace_irq_entry(ERROR_APIC_VECTOR, NULL, NULL); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1834,6 +1840,7 @@ void smp_error_interrupt(struct pt_regs *regs) */ pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); + trace_irq_exit(IRQ_HANDLED); irq_exit(); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c2a74..60939d5f226 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/idle.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,6 +236,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/idle.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -947,10 +949,17 @@ recalc: break; } } + enter_idle(); if (original_pm_idle) original_pm_idle(); else default_idle(); + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37..677f8475d9d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -111,6 +111,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd96..1aea11cd840 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -58,6 +58,7 @@ int main(void) OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396b..6052f6f65a6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1069,6 +1069,7 @@ unsigned long kernel_eflags; * debugging, no special alignment required. */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +EXPORT_PER_CPU_SYMBOL_GPL(orig_ist); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97..c8a6411d8ba 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/processor.h> #include <asm/system.h> @@ -402,8 +403,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { exit_idle(); irq_enter(); + trace_irq_entry(THERMAL_APIC_VECTOR, regs, NULL); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); + trace_irq_exit(IRQ_HANDLED); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index df20723a6a1..6bed23e1c74 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,6 +15,7 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> +#include <linux/ltt-core.h> #include <asm/stacktrace.h> @@ -253,6 +254,8 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!signr) return; + if (in_nmi()) + panic("Fatal exception in non-maskable interrupt"); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) @@ -277,6 +280,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); +#ifdef CONFIG_LTT + printk(KERN_EMERG "LTT NESTING LEVEL : %u", __get_cpu_var(ltt_nesting)); + printk("\n"); +#endif sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9ca3b0e343e..afd6d8ef78c 100644 --- a/arch/x86/kernel/entry_32.S +++ 
b/arch/x86/kernel/entry_32.S @@ -80,6 +80,8 @@ #define nr_syscalls ((syscall_table_size)/4) +#define NMI_MASK 0x04000000 + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -321,8 +323,32 @@ END(ret_from_fork) # userspace resumption stub bypassing syscall exit tracing ALIGN RING0_PTREGS_FRAME + ret_from_exception: preempt_stop(CLBR_ANY) + GET_THREAD_INFO(%ebp) + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jae resume_userspace # returning to v8086 or userspace + testl $NMI_MASK,TI_preempt_count(%ebp) + jz resume_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz resume_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + TRACE_IRQS_IRET + RESTORE_REGS + addl $4, %esp # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 + INTERRUPT_RETURN_NMI_SAFE + ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -906,6 +932,10 @@ ENTRY(native_iret) .previous END(native_iret) +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ? +END(native_nmi_return) + ENTRY(native_irq_enable_sysexit) sti sysexit diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bbd5c80cb09..7800ff65aab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -163,6 +163,8 @@ GLOBAL(return_to_handler) #endif +#define NMI_MASK 0x04000000 + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -515,6 +517,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $_TIF_KERNEL_TRACE,%edx /* Re-read : concurrently changed */ + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -524,6 +528,16 @@ sysret_careful: popq_cfi %rdi jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON @@ -872,6 +886,9 @@ ENTRY(native_iret) .section __ex_table,"a" .quad native_iret, bad_iret .previous + +ENTRY(native_nmi_return) + NATIVE_INTERRUPT_RETURN_NMI_SAFE #endif .section .fixup,"ax" @@ -924,6 +941,24 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule + /* Returning to kernel space from exception. */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retexc_kernel) + testl $NMI_MASK,TI_preempt_count(%rcx) + jz retint_kernel /* Not nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */ + jnz retint_kernel /* + * If single-stepping an NMI handler, + * use the normal iret path instead of + * the popf/lret because lret would be + * single-stepped. It should not + * happen : it will reactivate NMIs + * prematurely. + */ + RESTORE_ARGS 0,8,0 + TRACE_IRQS_IRETQ + INTERRUPT_RETURN_NMI_SAFE + #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. 
*/ @@ -1361,12 +1396,18 @@ ENTRY(paranoid_exit) paranoid_swapgs: TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK +paranoid_restore_no_nmi: RESTORE_ALL 8 jmp irq_return paranoid_restore: + GET_THREAD_INFO(%rcx) TRACE_IRQS_IRETQ 0 + testl $NMI_MASK,TI_preempt_count(%rcx) + jz paranoid_restore_no_nmi /* Nested over NMI ? */ + testw $X86_EFLAGS_TF,EFLAGS-0(%rsp) /* trap flag? */ + jnz paranoid_restore_no_nmi RESTORE_ALL 8 - jmp irq_return + INTERRUPT_RETURN_NMI_SAFE paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx @@ -1465,7 +1506,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jne retexc_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71..1fc5da98373 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -156,6 +156,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) @@ -204,6 +205,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); +extern void native_nmi_return(void); extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -373,6 +375,7 @@ struct pv_cpu_ops pv_cpu_ops = { .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, + .nmi_return = native_nmi_return, .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab..ac372778bbc 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -1,10 +1,13 @@ -#include <asm/paravirt.h> +#include <linux/stringify.h> +#include <linux/irqflags.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); @@ -41,6 +44,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34f93e..5339e67dc15 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -1,12 +1,15 @@ +#include <linux/irqflags.h> +#include <linux/stringify.h> #include <asm/paravirt.h> #include <asm/asm-offsets.h> -#include <linux/stringify.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_cpu_ops, iret, 
"iretq"); +DEF_NATIVE(pv_cpu_ops, nmi_return, + __stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE)); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -51,6 +54,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, nmi_return); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff455419898..e0e4ffcad48 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -13,6 +13,7 @@ #include <linux/dmi.h> #include <linux/utsname.h> #include <trace/events/power.h> +#include <trace/sched.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/system.h> @@ -23,6 +24,8 @@ #include <asm/i387.h> #include <asm/debugreg.h> +DEFINE_TRACE(sched_kthread_create); + struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -278,6 +281,7 @@ extern void kernel_thread_helper(void); int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; + long pid; memset(®s, 0, sizeof(regs)); @@ -299,7 +303,10 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, + 0, ®s, 0, NULL, NULL); + trace_sched_kthread_create(fn, pid); + return pid; } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af4..256ba23211d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,9 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/notifier.h> +#include <linux/idle.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -59,6 +62,38 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + +static DEFINE_PER_CPU(unsigned char, is_idle); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + trace_pm_idle_entry(); + notify_idle(IDLE_START); +} +EXPORT_SYMBOL_GPL(enter_idle); + +void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + notify_idle(IDLE_END); + trace_pm_idle_exit(); +} +EXPORT_SYMBOL_GPL(__exit_idle); + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +EXPORT_SYMBOL_GPL(exit_idle); + /* * Return saved PC of a blocked thread. */ @@ -107,10 +142,18 @@ void cpu_idle(void) play_dead(); local_irq_disable(); + enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); start_critical_timings(); + + /* + * In many cases the interrupt that ended idle + * has already called exit_idle. But some idle + * loops can be woken up without interrupt. 
+ */ + __exit_idle(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b..fbde94f1447 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -35,8 +35,10 @@ #include <linux/tick.h> #include <linux/prctl.h> #include <linux/uaccess.h> +#include <linux/idle.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <trace/pm.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -51,37 +53,34 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> +DEFINE_TRACE(pm_idle_exit); +DEFINE_TRACE(pm_idle_entry); + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - void enter_idle(void) { percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); + /* + * Trace last event before calling notifiers. Notifiers flush + * data from buffers before going to idle. + */ + trace_pm_idle_entry(); + notify_idle(IDLE_START); } +EXPORT_SYMBOL_GPL(enter_idle); -static void __exit_idle(void) +void __exit_idle(void) { if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); + notify_idle(IDLE_END); + trace_pm_idle_exit(); } +EXPORT_SYMBOL_GPL(__exit_idle); /* Called from interrupts to signify idle end */ void exit_idle(void) @@ -91,6 +90,7 @@ void exit_idle(void) return; __exit_idle(); } +EXPORT_SYMBOL_GPL(exit_idle); #ifndef CONFIG_SMP static inline void play_dead(void) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc4b72..ee3024d4f61 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> +#include <trace/syscall.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -152,6 +153,9 @@ static const int arg_offs_table[] = { X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) +DEFINE_TRACE(syscall_entry); +DEFINE_TRACE(syscall_exit); + /* * Determines whether a value may be installed in a segment register. */ @@ -1361,6 +1365,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) if (test_thread_flag(TIF_SINGLESTEP)) regs->flags |= X86_EFLAGS_TF; + trace_syscall_entry(regs, regs->orig_ax); + /* do the secure computing check first */ secure_computing(regs->orig_ax); @@ -1396,6 +1402,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) { bool step; + trace_syscall_exit(regs->ax); + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d600829..5e74f6aa3c0 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -1,8 +1,11 @@ /* System call table for x86-64. 
*/ #include <linux/linkage.h> +#include <linux/module.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> #include <asm/asm-offsets.h> #define __NO_STUBS @@ -27,3 +30,18 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/unistd_64.h> }; + +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < __NR_syscall_max + 1; i++) { + sprint_symbol(namebuf, (unsigned long)sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, + call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c new file mode 100644 index 00000000000..47539e28276 --- /dev/null +++ b/arch/x86/kernel/trace-clock.c @@ -0,0 +1,302 @@ +/* + * arch/x86/kernel/trace-clock.c + * + * Trace clock for x86. + * + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>, October 2008 + */ + +#include <linux/module.h> +#include <linux/trace-clock.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/cpu.h> +#include <linux/posix-timers.h> +#include <asm/vgtod.h> + +static cycles_t trace_clock_last_tsc; +static DEFINE_PER_CPU(struct timer_list, update_timer); +static DEFINE_SPINLOCK(async_tsc_lock); +static int async_tsc_refcount; /* Number of readers */ +static int async_tsc_enabled; /* Async TSC enabled on all online CPUs */ + +int _trace_clock_is_sync = 1; +EXPORT_SYMBOL_GPL(_trace_clock_is_sync); + +/* + * Is the trace clock being used by user-space ? We leave the trace clock active + * as soon as user-space starts using it. We never unref the trace clock + * reference taken by user-space. + */ +static atomic_t user_trace_clock_ref; + +/* + * Called by check_tsc_sync_source from CPU hotplug. + */ +void set_trace_clock_is_sync(int state) +{ + _trace_clock_is_sync = state; + update_trace_clock_is_sync_vdso(); +} + +#if BITS_PER_LONG == 64 +static cycles_t read_last_tsc(void) +{ + return trace_clock_last_tsc; +} +#else +/* + * A cmpxchg64 update can happen concurrently. Based on the assumption that + * two cmpxchg64 will never update it to the same value (the count always + * increases), reading it twice insures that we read a coherent value with the + * same "sequence number". + */ +static cycles_t read_last_tsc(void) +{ + cycles_t val1, val2; + + val1 = trace_clock_last_tsc; + for (;;) { + val2 = val1; + barrier(); + val1 = trace_clock_last_tsc; + if (likely(val1 == val2)) + break; + } + return val1; +} +#endif + +/* + * Support for architectures with non-sync TSCs. + * When the local TSC is discovered to lag behind the highest TSC counter, we + * increment the TSC count of an amount that should be, ideally, lower than the + * execution time of this routine, in cycles : this is the granularity we look + * for : we must be able to order the events. + */ +notrace cycles_t trace_clock_async_tsc_read(void) +{ + cycles_t new_tsc, last_tsc; + + WARN_ON(!async_tsc_refcount || !async_tsc_enabled); + new_tsc = get_cycles(); + last_tsc = read_last_tsc(); + do { + if (new_tsc < last_tsc) + new_tsc = last_tsc + TRACE_CLOCK_MIN_PROBE_DURATION; + /* + * If cmpxchg fails with a value higher than the new_tsc, don't + * retry : the value has been incremented and the events + * happened almost at the same time. 
+ * We must retry if cmpxchg fails with a lower value : + * it means that we are the CPU with highest frequency and + * therefore MUST update the value. + */ + last_tsc = cmpxchg64(&trace_clock_last_tsc, last_tsc, new_tsc); + } while (unlikely(last_tsc < new_tsc)); + return new_tsc; +} +EXPORT_SYMBOL_GPL(trace_clock_async_tsc_read); + +static void update_timer_ipi(void *info) +{ + (void)trace_clock_async_tsc_read(); +} + +/* + * update_timer_fct : - Timer function to resync the clocks + * @data: unused + * + * Fires every jiffy. + */ +static void update_timer_fct(unsigned long data) +{ + (void)trace_clock_async_tsc_read(); + mod_timer_pinned(&per_cpu(update_timer, smp_processor_id()), + jiffies + 1); +} + +static void enable_trace_clock(int cpu) +{ + init_timer(&per_cpu(update_timer, cpu)); + per_cpu(update_timer, cpu).function = update_timer_fct; + per_cpu(update_timer, cpu).expires = jiffies + 1; + smp_call_function_single(cpu, update_timer_ipi, NULL, 1); + add_timer_on(&per_cpu(update_timer, cpu), cpu); +} + +static void disable_trace_clock(int cpu) +{ + del_timer_sync(&per_cpu(update_timer, cpu)); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + int cpu; + + spin_lock(&async_tsc_lock); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* + * trace_clock_is_sync() is updated by set_trace_clock_is_sync() + * code, protected by cpu hotplug disable. + * It is ok to let the hotplugged CPU read the timebase before + * the CPU_ONLINE notification. It's just there to give a + * maximum bound to the TSC error. + */ + if (async_tsc_refcount && !trace_clock_is_sync()) { + if (!async_tsc_enabled) { + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); + } else { + enable_trace_clock(hotcpu); + } + } + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* + * We cannot stop the trace clock on other CPUs when readers are + * active even if we go back to a synchronized state (1 CPU) + * because the CPU left could be the one lagging behind. 
+ */ + if (async_tsc_refcount && async_tsc_enabled) + disable_trace_clock(hotcpu); + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + spin_unlock(&async_tsc_lock); + + return NOTIFY_OK; +} + +int get_trace_clock(void) +{ + int cpu; + + if (!trace_clock_is_sync()) { + printk(KERN_WARNING + "Trace clock falls back on cache-line bouncing\n" + "workaround due to non-synchronized TSCs.\n" + "This workaround preserves event order across CPUs.\n" + "Please consider disabling Speedstep or PowerNow and\n" + "using kernel parameters " + "\"force_tsc_sync=1 idle=poll\"\n" + "for accurate and fast tracing clock source.\n"); + } + + get_online_cpus(); + spin_lock(&async_tsc_lock); + if (async_tsc_refcount++ || trace_clock_is_sync()) + goto end; + + async_tsc_enabled = 1; + for_each_online_cpu(cpu) + enable_trace_clock(cpu); +end: + spin_unlock(&async_tsc_lock); + put_online_cpus(); + return 0; +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + int cpu; + + get_online_cpus(); + spin_lock(&async_tsc_lock); + WARN_ON(async_tsc_refcount <= 0); + if (async_tsc_refcount != 1 || !async_tsc_enabled) + goto end; + + for_each_online_cpu(cpu) + disable_trace_clock(cpu); + async_tsc_enabled = 0; +end: + async_tsc_refcount--; + if (!async_tsc_refcount && num_online_cpus() == 1) + set_trace_clock_is_sync(1); + spin_unlock(&async_tsc_lock); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(put_trace_clock); + +static int posix_get_trace(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + int ret; + + /* + * Yes, there is a race here that would lead to refcount being + * incremented more than once, but all we care is to leave the trace + * clock active forever, so precise accounting is not needed. + */ + if (unlikely(!atomic_read(&user_trace_clock_ref))) { + ret = get_trace_clock(); + if (ret) + return ret; + atomic_inc(&user_trace_clock_ref); + } + lts->lttng_ts = trace_clock_read64(); + return 0; +} + +static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = trace_clock_frequency(); + return 0; +} + +static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp) +{ + union lttng_timespec *lts = (union lttng_timespec *) tp; + + lts->lttng_ts = TRACE_CLOCK_RES; + return 0; +} + +static __init int init_unsync_trace_clock(void) +{ + struct k_clock clock_trace = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace, + }; + struct k_clock clock_trace_freq = { + .clock_getres = posix_get_trace_res, + .clock_get = posix_get_trace_freq, + }; + + register_posix_clock(CLOCK_TRACE, &clock_trace); + register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq); + + hotcpu_notifier(hotcpu_callback, 4); + return 0; +} +early_initcall(init_unsync_trace_clock); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9d..d45030679f9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <trace/trap.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -52,6 +53,7 @@ #include <asm/atomic.h> #include <asm/system.h> #include <asm/traps.h> +#include <asm/unistd.h> #include <asm/desc.h> #include <asm/i387.h> #include <asm/mce.h> @@ -76,11 +78,21 @@ char ignore_fpu_irq; * F0 0F bug workaround. 
*/ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; + +extern unsigned long sys_call_table[]; +extern unsigned long syscall_table_size; + #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); +/* + * Also used in arch/x86/mm/fault.c. + */ +DEFINE_TRACE(trap_entry); +DEFINE_TRACE(trap_exit); + static int ignore_nmis; int unknown_nmi_panic; @@ -122,6 +134,8 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; + trace_trap_entry(regs, trapnr); + #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* @@ -168,7 +182,7 @@ trap_signal: force_sig_info(signr, info, tsk); else force_sig(signr, tsk); - return; + goto end; kernel_trap: if (!fixup_exception(regs)) { @@ -176,15 +190,17 @@ kernel_trap: tsk->thread.trap_no = trapnr; die(str, regs, error_code); } - return; + goto end; #ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; - return; + goto end; #endif +end: + trace_trap_exit(); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -285,7 +301,9 @@ do_general_protection(struct pt_regs *regs, long error_code) printk("\n"); } + trace_trap_entry(regs, 13); force_sig(SIGSEGV, tsk); + trace_trap_exit(); return; #ifdef CONFIG_X86_32 @@ -371,9 +389,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { + trace_trap_entry(regs, 2); + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + goto end; #ifdef CONFIG_MCA /* * Might actually be able to figure out what the guilty party @@ -381,7 +401,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) */ if (MCA_bus) { mca_handle_nmi(); - return; + goto end; } #endif pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", @@ -392,19 +412,23 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); +end: + trace_trap_exit(); } static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + trace_trap_entry(regs, 2); + /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific * NMI can not be detected/processed on other CPUs. 
*/ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; + goto end; /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); @@ -423,11 +447,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); #endif raw_spin_unlock(&nmi_reason_lock); - return; + goto end; } raw_spin_unlock(&nmi_reason_lock); unknown_nmi_error(reason, regs); +end: + trace_trap_exit(); } dotraplinkage notrace __kprobes void @@ -570,8 +596,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { + trace_trap_entry(regs, 1); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + trace_trap_exit(); preempt_conditional_cli(regs); return; } @@ -589,13 +617,32 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) regs->flags &= ~X86_EFLAGS_TF; } si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) { + trace_trap_entry(regs, 1); send_sigtrap(tsk, regs, error_code, si_code); + trace_trap_exit(); + } preempt_conditional_cli(regs); return; } +#ifdef CONFIG_X86_32 +void ltt_dump_sys_call_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < NR_syscalls; i++) { + sprint_symbol(namebuf, sys_call_table[i]); + __trace_mark(0, syscall_state, sys_call_table, call_data, + "id %d address %p symbol %s", + i, (void*)sys_call_table[i], namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_sys_call_table); +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -701,11 +748,13 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + trace_trap_entry(regs, 16); conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif + trace_trap_exit(); } asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) @@ -738,6 +787,21 @@ void __math_state_restore(void) tsk->fpu_counter++; } +void ltt_dump_idt_table(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < IDT_ENTRIES; i++) { + unsigned long address = gate_offset(idt_table[i]); + sprint_symbol(namebuf, address); + __trace_mark(0, irq_state, idt_table, call_data, + "irq %d address %p symbol %s", + i, (void *)address, namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_idt_table); + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c deleted file mode 100644 index 0aa5fed8b9e..00000000000 --- a/arch/x86/kernel/tsc_sync.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * check TSC synchronization. - * - * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar - * - * We check whether all boot CPUs have their TSC's synchronized, - * print a warning if not and turn off the TSC clock-source. - * - * The warp-check is point-to-point between two CPUs, the CPU - * initiating the bootup is the 'source CPU', the freshly booting - * CPU is the 'target CPU'. - * - * Only two CPUs may participate - they can enter in any order. 
- * ( The serial nature of the boot logic and the CPU hotplug lock - * protects against more than 2 CPUs entering this code. ) - */ -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/nmi.h> -#include <asm/tsc.h> - -/* - * Entry/exit counters that make sure that both CPUs - * run the measurement code at once: - */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; - -/* - * We use a raw spinlock in this exceptional case, because - * we want to have the fastest, inlined, non-debug version - * of a critical section, to be able to prove TSC time-warps: - */ -static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; - -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; - -/* - * TSC-warp measurement loop running on both CPUs: - */ -static __cpuinit void check_tsc_warp(void) -{ - cycles_t start, now, prev, end; - int i; - - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); - /* - * The measurement runs for 20 msecs: - */ - end = start + tsc_khz * 20ULL; - now = start; - - for (i = 0; ; i++) { - /* - * We take the global lock, measure TSC, save the - * previous TSC that was measured (possibly on - * another CPU) and update the previous TSC timestamp. - */ - arch_spin_lock(&sync_lock); - prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); - last_tsc = now; - arch_spin_unlock(&sync_lock); - - /* - * Be nice every now and then (and also check whether - * measurement is done [we also insert a 10 million - * loops safety exit, so we dont lock up in case the - * TSC readout is totally broken]): - */ - if (unlikely(!(i & 7))) { - if (now > end || i > 10000000) - break; - cpu_relax(); - touch_nmi_watchdog(); - } - /* - * Outside the critical section we can now see whether - * we saw a time-warp of the TSC going backwards: - */ - if (unlikely(prev > now)) { - arch_spin_lock(&sync_lock); - max_warp = max(max_warp, prev - now); - nr_warps++; - arch_spin_unlock(&sync_lock); - } - } - WARN(!(now-start), - "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", - now-start, end-start); -} - -/* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: - */ -void __cpuinit check_tsc_sync_source(int cpu) -{ - int cpus = 2; - - /* - * No need to check if we already know that the TSC is not - * synchronized: - */ - if (unsynchronized_tsc()) - return; - - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - - /* - * Reset it - in case this is a second bootup: - */ - atomic_set(&stop_count, 0); - - /* - * Wait for the target to arrive: - */ - while (atomic_read(&start_count) != cpus-1) - cpu_relax(); - /* - * Trigger the target to continue into the measurement too: - */ - atomic_inc(&start_count); - - check_tsc_warp(); - - while (atomic_read(&stop_count) != cpus-1) - cpu_relax(); - - if (nr_warps) { - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", - smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); - mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); - } - - /* - * Reset it - just in case we boot another 
CPU later: - */ - atomic_set(&start_count, 0); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; - - /* - * Let the target continue with the bootup: - */ - atomic_inc(&stop_count); -} - -/* - * Freshly booted CPUs call into this: - */ -void __cpuinit check_tsc_sync_target(void) -{ - int cpus = 2; - - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) - return; - - /* - * Register this CPU's participation and wait for the - * source CPU to start the measurement: - */ - atomic_inc(&start_count); - while (atomic_read(&start_count) != cpus) - cpu_relax(); - - check_tsc_warp(); - - /* - * Ok, we are done: - */ - atomic_inc(&stop_count); - - /* - * Wait for the source CPU to print stuff: - */ - while (atomic_read(&stop_count) != cpus) - cpu_relax(); -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c4b69..df18f14c473 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,6 +44,8 @@ #include <asm/desc.h> #include <asm/topology.h> #include <asm/vgtod.h> +#include <asm/trace-clock.h> +#include <asm/timer.h> #define __vsyscall(nr) \ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace @@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, + .trace_clock_is_sync = 1, }; void update_vsyscall_tz(void) @@ -73,6 +76,16 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } +void update_trace_clock_is_sync_vdso(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} +EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso); + void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { @@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac..f894af174b8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,7 +32,7 @@ #include "irq.h" #include <linux/kvm_host.h> -#include "trace.h" +#include <asm/kvm-trace.h> static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d365..58bcbce5b02 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,7 +36,7 @@ #include <asm/atomic.h> #include "kvm_cache_regs.h" #include "irq.h" -#include "trace.h" +#include <asm/kvm-trace.h> #include "x86.h" #ifndef CONFIG_X86_64 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d4..3612044ed1f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -163,7 +163,7 @@ module_param(oos_shadow, bool, 0644); #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "mmutrace.h" +#include <asm/kvm-mmutrace.h> #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d8a15a17d76..42342e8cc18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,7 +34,7 @@ #include <asm/kvm_para.h> #include <asm/virtext.h> -#include 
"trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb8..d12b42e234b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -40,7 +40,7 @@ #include <asm/i387.h> #include <asm/xcr.h> -#include "trace.h" +#include <asm/kvm-trace.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85b..6a8cb6fe5c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -47,7 +47,7 @@ #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS -#include "trace.h" +#include <asm/kvm-trace.h> #include <asm/debugreg.h> #include <asm/msr.h> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0..07f7a272226 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1276,6 +1276,7 @@ __init void lguest_init(void) pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; + pv_cpu_ops.nmi_return = lguest_iret; pv_cpu_ops.load_sp0 = lguest_load_sp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 20e3f8702d1..abeb09914d5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <trace/fault.h> #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -35,6 +36,11 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +DEFINE_TRACE(page_fault_entry); +DEFINE_TRACE(page_fault_exit); +DEFINE_TRACE(page_fault_nosem_entry); +DEFINE_TRACE(page_fault_nosem_exit); + /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: @@ -719,6 +725,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; + trace_page_fault_nosem_entry(regs, 14, address); if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -728,6 +735,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + trace_page_fault_nosem_exit(); return; } @@ -1130,7 +1138,9 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ + trace_page_fault_entry(regs, 14, mm, vma, address, write); fault = handle_mm_fault(mm, vma, address, flags); + trace_page_fault_exit(fault); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8..14b9317eccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/cpu.h> +#include <trace/irq.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -141,6 +142,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &flush_state[sender]; + trace_irq_entry(sender, regs, NULL); + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) goto out; /* @@ -167,6 +170,7 @@ out: cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); smp_mb__after_clear_bit(); inc_irq_stat(irq_tlb_count); + trace_irq_exit(IRQ_HANDLED); } static void flush_tlb_others_ipi(const struct cpumask *cpumask, diff --git 
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 127775696d6..99513642a0e 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/mfd/core.h>
 
 #include <asm/io.h>
 #include <asm/olpc.h>
@@ -56,25 +57,24 @@ static void xo1_power_off(void)
 static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 {
 	struct resource *res;
+	int err;
 
 	/* don't run on non-XOs */
 	if (!machine_is_olpc())
 		return -ENODEV;
 
+	err = mfd_cell_enable(pdev);
+	if (err)
+		return err;
+
 	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "can't fetch device resource info\n");
 		return -EIO;
 	}
-
-	if (!request_region(res->start, resource_size(res), DRV_NAME)) {
-		dev_err(&pdev->dev, "can't request region\n");
-		return -EIO;
-	}
-
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = res->start;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0)
 		acpi_base = res->start;
 
 	/* If we have both addresses, we can override the poweroff hook */
@@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev)
 
 static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 {
-	struct resource *r;
-
-	r = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	release_region(r->start, resource_size(r));
+	mfd_cell_disable(pdev);
 
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
+	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
 		pms_base = 0;
-	else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
 		acpi_base = 0;
 
 	pm_power_off = NULL;
@@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev)
 
 static struct platform_driver cs5535_pms_drv = {
 	.driver = {
-		.name = "cs5535-pms",
+		.name = "olpc-xo1-pms",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = {
 
 static struct platform_driver cs5535_acpi_drv = {
 	.driver = {
-		.name = "cs5535-acpi",
+		.name = "olpc-xo1-acpi",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void)
 {
 	int r;
 
-	r = platform_driver_register(&cs5535_pms_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms");
 	if (r)
 		return r;
 
-	r = platform_driver_register(&cs5535_acpi_drv);
+	r = mfd_shared_platform_driver_register(&cs5535_acpi_drv,
+						"cs5535-acpi");
 	if (r)
-		platform_driver_unregister(&cs5535_pms_drv);
+		mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 
 	return r;
 }
 
 static void __exit olpc_xo1_exit(void)
 {
-	platform_driver_unregister(&cs5535_acpi_drv);
-	platform_driver_unregister(&cs5535_pms_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_acpi_drv);
+	mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
 }
 
 MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:olpc-xo1");
+MODULE_ALIAS("platform:cs5535-pms");
 
 module_init(olpc_xo1_init);
 module_exit(olpc_xo1_exit);
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..7bc481508d0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,46 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 	return 0;
 }
 
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and export its value in the nsec field of the timespec
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+	unsigned long seq;
+	union lttng_timespec *lts = (union lttng_timespec *) ts;
+
+	do {
+		seq = read_seqbegin(&gtod->lock);
+		if (unlikely(!gtod->trace_clock_is_sync))
+			return vdso_fallback_gettime(CLOCK_TRACE, ts);
+		/*
+		 * We don't protect the rdtsc with the rdtsc_barrier because
+		 * we can't obtain with tracing that level of precision.
+		 * The operation of recording an event is not atomic therefore
+		 * the small chance of imprecision doesn't justify the overhead
+		 * of a barrier.
+		 */
+		/*
+		 * TODO: check that vget_cycles(), using paravirt ops, will
+		 * match the TSC read by get_cycles() at the kernel level.
+		 */
+		lts->lttng_ts = vget_cycles();
+	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+	return 0;
+}
+
+/*
+ * Returns the cpu_khz, it needs to be a syscall because we can't access
+ * this value from userspace and it will only be called at the beginning
+ * of the tracing session
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+	return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled))
@@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 			return do_realtime_coarse(ts);
 		case CLOCK_MONOTONIC_COARSE:
 			return do_monotonic_coarse(ts);
+		case CLOCK_TRACE:
+			return do_trace_clock(ts);
+		case CLOCK_TRACE_FREQ:
+			return do_trace_clock_freq(ts);
+		default:
+			return -EINVAL;
 		}
 	return vdso_fallback_gettime(clock, ts);
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542efe45f..e3839c74ec4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -974,6 +974,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.read_pmc = native_read_pmc,
 
 	.iret = xen_iret,
+	.nmi_return = xen_iret,
 	.irq_enable_sysexit = xen_sysexit,
 
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
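
With the vclock_gettime.c change above, a userspace tracer can read the trace clock through the ordinary clock_gettime() vDSO fast path whenever trace_clock_is_sync is set, and transparently falls back to the syscall otherwise. The sketch below is illustration only: the CLOCK_TRACE/CLOCK_TRACE_FREQ clockid values shown are placeholders (the real definitions are assumed to come from the trace-clock headers added elsewhere in this series), and the two timespec fields returned for CLOCK_TRACE overlay a raw cycle counter via union lttng_timespec rather than a conventional sec/nsec pair.

/*
 * Userspace illustration only, not part of this patch.  The clockid values
 * are hypothetical placeholders for the ones exported by the trace-clock
 * headers introduced by this patch series.
 */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TRACE
#define CLOCK_TRACE		14	/* placeholder value */
#define CLOCK_TRACE_FREQ	15	/* placeholder value */
#endif

int main(void)
{
	struct timespec ts;

	/* Fast path: served by do_trace_clock() in the vDSO when the TSC is
	 * synchronized; otherwise the vDSO falls back to the syscall. */
	if (clock_gettime(CLOCK_TRACE, &ts) == 0)
		printf("trace clock raw fields: %ld %ld\n",
		       (long)ts.tv_sec, (long)ts.tv_nsec);

	/* Always a syscall: do_trace_clock_freq() simply forwards to it. */
	if (clock_gettime(CLOCK_TRACE_FREQ, &ts) == 0)
		printf("trace clock freq fields: %ld %ld\n",
		       (long)ts.tv_sec, (long)ts.tv_nsec);

	return 0;
}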