Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    4
-rw-r--r--  kernel/exit.c                        |    4
-rw-r--r--  kernel/fork.c                        |   10
-rw-r--r--  kernel/futex.c                       |  125
-rw-r--r--  kernel/irq/handle.c                  |    8
-rw-r--r--  kernel/irq/irqdesc.c                 |    2
-rw-r--r--  kernel/itimer.c                      |    7
-rw-r--r--  kernel/kexec.c                       |    8
-rw-r--r--  kernel/lockdep.c                     |   22
-rw-r--r--  kernel/ltt-channels.c                |  388
-rw-r--r--  kernel/marker.c                      | 1262
-rw-r--r--  kernel/module.c                      |  110
-rw-r--r--  kernel/notifier.c                    |   31
-rw-r--r--  kernel/panic.c                       |    7
-rw-r--r--  kernel/printk.c                      |    7
-rw-r--r--  kernel/rcutree.c                     |    7
-rw-r--r--  kernel/sched.c                       |   54
-rw-r--r--  kernel/softirq.c                     |   27
-rw-r--r--  kernel/time/Makefile                 |    1
-rw-r--r--  kernel/time/tsc-sync.c               |  313
-rw-r--r--  kernel/timer.c                       |   16
-rw-r--r--  kernel/trace/Makefile                |    2
-rw-r--r--  kernel/trace/trace-clock-32-to-64.c  |  296
-rw-r--r--  kernel/trace/trace-clock.c           |   97
-rw-r--r--  kernel/trace/trace_printk.c          |    1
25 files changed, 2740 insertions, 69 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba3..c039580ba3b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
@@ -99,7 +100,10 @@ obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_MARKERS) += ltt-channels.o
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b..0d9a3444614 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -514,6 +514,8 @@ struct files_struct *get_files_struct(struct task_struct *task)
return files;
}
+EXPORT_SYMBOL(get_files_struct);
+
void put_files_struct(struct files_struct *files)
{
struct fdtable *fdt;
@@ -535,6 +537,8 @@ void put_files_struct(struct files_struct *files)
}
}
+EXPORT_SYMBOL(put_files_struct);
+
void reset_files_struct(struct files_struct *files)
{
struct task_struct *tsk = current;
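The two EXPORT_SYMBOL() lines above make get_files_struct() and put_files_struct() available to modules. A minimal sketch of the kind of module-side caller this enables; the helper name and the dump format are illustrative assumptions, not part of the patch:

#include <linux/sched.h>
#include <linux/fdtable.h>

/* Hypothetical user of the exports above: walk another task's fd table. */
static void example_dump_task_fds(struct task_struct *tsk)
{
	struct files_struct *files;
	struct fdtable *fdt;
	unsigned int fd;

	files = get_files_struct(tsk);	/* takes a reference; NULL if task exited */
	if (!files)
		return;

	rcu_read_lock();
	fdt = files_fdtable(files);
	for (fd = 0; fd < fdt->max_fds; fd++) {
		if (fcheck_files(files, fd))
			printk(KERN_DEBUG "pid %d: fd %u in use\n",
			       task_pid_nr(tsk), fd);
	}
	rcu_read_unlock();

	put_files_struct(files);
}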
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152dd..5bb0bb18434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -88,6 +88,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+EXPORT_SYMBOL(tasklist_lock);
#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
@@ -1250,6 +1251,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /*
+ * The state of the parent's TIF_KERNEL_TRACE flag may have changed
+ * since it was copied in dup_task_struct(), so we re-copy it here.
+ */
+ if (test_thread_flag(TIF_KERNEL_TRACE))
+ set_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+ else
+ clear_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
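The hunk above re-copies the parent's TIF_KERNEL_TRACE flag under tasklist_lock, so a child forked while tracing is being switched on or off still ends up with the current value. The other half of that mechanism would look roughly like the sketch below; it assumes TIF_KERNEL_TRACE is provided by the architecture part of this patch set (not shown here) and the helper name is illustrative:

#include <linux/sched.h>

/* Hypothetical tracer-side helper: update every existing thread. */
static void example_set_kernel_trace_flag(int enable)
{
	struct task_struct *g, *t;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (enable)
			set_tsk_thread_flag(t, TIF_KERNEL_TRACE);
		else
			clear_tsk_thread_flag(t, TIF_KERNEL_TRACE);
	} while_each_thread(g, t);
	read_unlock(&tasklist_lock);
}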
diff --git a/kernel/futex.c b/kernel/futex.c
index d5065e8283d..3cd76c15b09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
@@ -723,14 +722,12 @@ retry:
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
ownerdied = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
@@ -775,6 +772,24 @@ retry:
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
+ || plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
plist_del(&q->list, &hb1->chain);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
@@ -1525,8 +1525,7 @@ retry:
static void unqueue_me_pi(struct futex_q *q)
__releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1578,9 +1577,7 @@ retry:
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
@@ -1781,13 +1778,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -2046,9 +2044,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
@@ -2057,7 +2055,7 @@ retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2070,14 @@ retry:
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & FUTEX_OWNER_DIED) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
@@ -2167,7 +2162,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2463,11 +2458,20 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
@@ -2678,8 +2682,7 @@ static int __init futex_init(void)
* implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
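The futex.c hunks above change cmpxchg_futex_value_locked() so that its return value only reports whether the user-space access faulted, while the value actually read from *uaddr is handed back through the new first argument. A minimal sketch of the resulting caller pattern; the wrapper below is purely illustrative and not part of the patch:

/* Illustrative only: shows the new calling convention, not real futex logic. */
static int example_try_claim_futex(u32 __user *uaddr, u32 expected, u32 newval)
{
	u32 curval;

	/* A non-zero return now means the access itself faulted. */
	if (cmpxchg_futex_value_locked(&curval, uaddr, expected, newval))
		return -EFAULT;

	/* The access succeeded; curval holds whatever was found at uaddr. */
	if (curval != expected)
		return -EAGAIN;	/* raced with another task */

	return 0;		/* newval is now installed */
}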
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a719012..db864334a95 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <trace/events/irq.h>
+#include <trace/irq.h>
#include "internals.h"
@@ -51,6 +52,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
+DEFINE_TRACE(irq_entry);
+DEFINE_TRACE(irq_exit);
+
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -63,6 +67,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
+ trace_irq_entry(irq, NULL, action);
+
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
@@ -116,5 +122,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
add_interrupt_randomness(irq);
local_irq_disable();
+ trace_irq_exit(retval);
+
return retval;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bd..1c07afd307f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -109,6 +109,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
@@ -273,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153d..18fd8e919c0 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -13,9 +13,13 @@
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
+DEFINE_TRACE(timer_itimer_expired);
+DEFINE_TRACE(timer_itimer_set);
+
/**
* itimer_get_remtime - get remaining time for the timer
*
@@ -124,6 +128,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
+ trace_timer_itimer_expired(sig);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
@@ -201,6 +206,8 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
!timeval_valid(&value->it_interval))
return -EINVAL;
+ trace_timer_itimer_set(which, value);
+
switch (which) {
case ITIMER_REAL:
again:
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7eb..779f0031929 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/kmsg_dump.h>
+#include <trace/kernel.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -40,6 +41,9 @@
#include <asm/system.h>
#include <asm/sections.h>
+DEFINE_TRACE(kernel_kernel_kexec);
+DEFINE_TRACE(kernel_crash_kexec);
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -1066,6 +1070,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
void crash_kexec(struct pt_regs *regs)
{
+ trace_kernel_crash_kexec(kexec_crash_image, regs);
+
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
@@ -1495,6 +1501,8 @@ int kernel_kexec(void)
{
int error = 0;
+ trace_kernel_kernel_kexec(kexec_image);
+
if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
if (!kexec_image) {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f..e0841c537db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,6 +49,8 @@
#include "lockdep_internals.h"
+#include <trace/lockdep.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -66,6 +68,13 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+DEFINE_TRACE(lockdep_hardirqs_on);
+DEFINE_TRACE(lockdep_hardirqs_off);
+DEFINE_TRACE(lockdep_softirqs_on);
+DEFINE_TRACE(lockdep_softirqs_off);
+DEFINE_TRACE(lockdep_lock_acquire);
+DEFINE_TRACE(lockdep_lock_release);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -2300,6 +2309,8 @@ void trace_hardirqs_on_caller(unsigned long ip)
time_hardirqs_on(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_on(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2358,6 +2369,8 @@ void trace_hardirqs_off_caller(unsigned long ip)
time_hardirqs_off(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_off(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2390,6 +2403,8 @@ void trace_softirqs_on(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_on(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2424,6 +2439,8 @@ void trace_softirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_off(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2730,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
+ trace_lockdep_lock_acquire(ip, subclass, lock, trylock, read,
+ hardirqs_off);
+
if (!prove_locking)
check = 1;
@@ -3108,6 +3128,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_lock_release(ip, lock, nested);
+
if (!check_unlock(curr, lock, ip))
return;
diff --git a/kernel/ltt-channels.c b/kernel/ltt-channels.c
new file mode 100644
index 00000000000..102513874ad
--- /dev/null
+++ b/kernel/ltt-channels.c
@@ -0,0 +1,388 @@
+/*
+ * ltt/ltt-channels.c
+ *
+ * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng channel management.
+ *
+ * Author:
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/ltt-channels.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * ltt_channel_mutex may be nested inside the LTT trace mutex.
+ * ltt_channel_mutex may be nested inside the markers mutex.
+ */
+static DEFINE_MUTEX(ltt_channel_mutex);
+static LIST_HEAD(ltt_channels);
+/*
+ * Index of next channel in array. Makes sure that as long as a trace channel is
+ * allocated, no array index will be re-used when a channel is freed and then
+ * another channel is allocated. This index is cleared and the array indexeds
+ * get reassigned when the index_kref goes back to 0, which indicates that no
+ * more trace channels are allocated.
+ */
+static unsigned int free_index;
+/* index_kref is protected by both ltt_channel_mutex and lock_markers */
+static struct kref index_kref; /* Keeps track of allocated trace channels */
+
+static struct ltt_channel_setting *lookup_channel(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (strcmp(name, iter->name) == 0)
+ return iter;
+ return NULL;
+}
+
+/*
+ * Must be called when channel refcount falls to 0 _and_ also when the last
+ * trace is freed. This function is responsible for compacting the channel and
+ * event IDs when no users are active.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_channel_setting(struct kref *kref)
+{
+ struct ltt_channel_setting *setting = container_of(kref,
+ struct ltt_channel_setting, kref);
+ struct ltt_channel_setting *iter;
+
+ if (atomic_read(&index_kref.refcount) == 0
+ && atomic_read(&setting->kref.refcount) == 0) {
+ list_del(&setting->list);
+ kfree(setting);
+
+ free_index = 0;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ iter->index = free_index++;
+ iter->free_event_id = 0;
+ }
+ }
+}
+
+/*
+ * Perform channel index compaction when the last trace channel is freed.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_trace_channel(struct kref *kref)
+{
+ struct ltt_channel_setting *iter, *n;
+
+ list_for_each_entry_safe(iter, n, &ltt_channels, list)
+ release_channel_setting(&iter->kref);
+ if (atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+}
+
+/*
+ * ltt_channels_trace_ref - Is there an existing trace session?
+ *
+ * Must be called with lock_markers() held.
+ */
+int ltt_channels_trace_ref(void)
+{
+ return !!atomic_read(&index_kref.refcount);
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_ref);
+
+/**
+ * ltt_channels_register - Register a trace channel.
+ * @name: channel name
+ *
+ * Uses refcounting.
+ */
+int ltt_channels_register(const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (setting) {
+ if (atomic_read(&setting->kref.refcount) == 0)
+ goto init_kref;
+ else {
+ kref_get(&setting->kref);
+ goto end;
+ }
+ }
+ setting = kzalloc(sizeof(*setting), GFP_KERNEL);
+ if (!setting) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ list_add(&setting->list, &ltt_channels);
+ strncpy(setting->name, name, PATH_MAX-1);
+ setting->index = free_index++;
+init_kref:
+ kref_init(&setting->kref);
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_register);
+
+/**
+ * ltt_channels_unregister - Unregister a trace channel.
+ * @name: channel name
+ * @compacting: performing compaction
+ *
+ * Must be called with markers mutex held.
+ */
+int ltt_channels_unregister(const char *name, int compacting)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ if (!compacting)
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ kref_put(&setting->kref, release_channel_setting);
+ if (!compacting && atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+end:
+ if (!compacting)
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_unregister);
+
+/**
+ * ltt_channels_set_default - Set channel default behavior.
+ * @name: default channel name
+ * @sb_size: size of the subbuffers
+ * @n_sb: number of subbuffers
+ */
+int ltt_channels_set_default(const char *name,
+ unsigned int sb_size,
+ unsigned int n_sb)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ setting->sb_size = sb_size;
+ setting->n_sb = n_sb;
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_set_default);
+
+/**
+ * ltt_channels_get_name_from_index - get channel name from channel index
+ * @index: channel index
+ *
+ * Allows looking up the channel name given its index. Done to keep the name
+ * information outside of each trace channel instance.
+ */
+const char *ltt_channels_get_name_from_index(unsigned int index)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (iter->index == index && atomic_read(&iter->kref.refcount))
+ return iter->name;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index);
+
+static struct ltt_channel_setting *
+ltt_channels_get_setting_from_name(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (!strcmp(iter->name, name)
+ && atomic_read(&iter->kref.refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * ltt_channels_get_index_from_name - get channel index from channel name
+ * @name: channel name
+ *
+ * Allows looking up the channel index given its name. Done to keep the name
+ * information outside of each trace channel instance.
+ * Returns -1 if not found.
+ */
+int ltt_channels_get_index_from_name(const char *name)
+{
+ struct ltt_channel_setting *setting;
+
+ setting = ltt_channels_get_setting_from_name(name);
+ if (setting)
+ return setting->index;
+ else
+ return -1;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name);
+
+/**
+ * ltt_channels_trace_alloc - Allocate channel structures for a trace
+ * @nr_channels: returns the number of channels allocated
+ * @overwrite: overwrite flag to set on each channel
+ * @active: active flag to set on each channel
+ *
+ * Use the current channel list to allocate the channels for a trace.
+ * Called with trace lock held. Does not perform the trace buffer allocation,
+ * because we must let the user overwrite specific channel sizes.
+ */
+struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels,
+ int overwrite, int active)
+{
+ struct ltt_chan *chan = NULL;
+ struct ltt_channel_setting *iter;
+
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ if (!free_index)
+ goto end;
+ if (!atomic_read(&index_kref.refcount))
+ kref_init(&index_kref);
+ else
+ kref_get(&index_kref);
+ *nr_channels = free_index;
+ chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL);
+ if (!chan)
+ goto end;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ if (!atomic_read(&iter->kref.refcount))
+ continue;
+ chan[iter->index].a.sb_size = iter->sb_size;
+ chan[iter->index].a.n_sb = iter->n_sb;
+ chan[iter->index].overwrite = overwrite;
+ chan[iter->index].active = active;
+ strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1);
+ chan[iter->index].switch_timer_interval = 0;
+ }
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ return chan;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc);
+
+/**
+ * ltt_channels_trace_free - Free one trace's channels
+ * @channels: channels to free
+ * @nr_channels: number of channels to free
+ *
+ * Called with trace lock held. The actual channel buffers must be freed before
+ * this function is called.
+ */
+void ltt_channels_trace_free(struct ltt_chan *channels,
+ unsigned int nr_channels)
+{
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ kfree(channels);
+ kref_put(&index_kref, release_trace_channel);
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ marker_update_probes();
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_free);
+
+/**
+ * ltt_channels_trace_set_timer - set switch timer
+ * @chan: channel
+ * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer.
+ */
+
+void ltt_channels_trace_set_timer(struct ltt_chan *chan,
+ unsigned long interval)
+{
+ chan->switch_timer_interval = interval;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer);
+
+/**
+ * _ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ * Must be called with channels mutex held.
+ */
+int _ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret;
+
+ setting = ltt_channels_get_setting_from_name(channel);
+ if (!setting) {
+ ret = -ENOENT;
+ goto end;
+ }
+ if (strcmp(channel, "metadata") == 0) {
+ if (strcmp(name, "core_marker_id") == 0)
+ ret = 0;
+ else if (strcmp(name, "core_marker_format") == 0)
+ ret = 1;
+ else
+ ret = -ENOENT;
+ goto end;
+ }
+ if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) {
+ ret = -ENOSPC;
+ goto end;
+ }
+ ret = setting->free_event_id++;
+end:
+ return ret;
+}
+
+/**
+ * ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ */
+int ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ int ret;
+
+ mutex_lock(&ltt_channel_mutex);
+ ret = _ltt_channels_get_event_id(channel, name);
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+
+/**
+ * _ltt_channels_reset_event_ids - reset event IDs at compaction
+ *
+ * Called with lock_markers() and the channel mutex held.
+ */
+void _ltt_channels_reset_event_ids(void)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ iter->free_event_id = 0;
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management");
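Before moving on to marker.c, which is the main consumer of this interface, here is a hedged usage sketch of the channel API defined above. The channel name, the sub-buffer geometry and the error handling are illustrative assumptions; lock_markers()/unlock_markers() are the helpers exported by marker.c below.

#include <linux/ltt-channels.h>

extern void lock_markers(void);		/* exported by kernel/marker.c below */
extern void unlock_markers(void);

static int example_setup_channel(void)
{
	int index, ret;

	ret = ltt_channels_register("example");		/* refcounted */
	if (ret)
		return ret;

	/* Suggest a default geometry: 8 sub-buffers of 4 KiB each. */
	ret = ltt_channels_set_default("example", 4096, 8);
	if (ret)
		goto out_unregister;

	index = ltt_channels_get_index_from_name("example");
	if (index < 0) {
		ret = -ENOENT;
		goto out_unregister;
	}
	return 0;

out_unregister:
	lock_markers();		/* unregistration requires the markers mutex */
	ltt_channels_unregister("example", 0);
	unlock_markers();
	return ret;
}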
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 00000000000..eac8ebfc3b9
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,1262 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/immediate.h>
+#include <linux/ltt-channels.h>
+
+extern struct marker __start___markers[];
+extern struct marker __stop___markers[];
+
+/* Set to 1 to enable marker debug output */
+static const int marker_debug;
+
+/*
+ * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
+ * and module markers and the hash table.
+ * markers_mutex nests inside the trace lock, to ensure event ID consistency
+ * between the hash table and the marker section.
+ */
+static DEFINE_MUTEX(markers_mutex);
+
+void lock_markers(void)
+{
+ mutex_lock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_markers);
+
+void unlock_markers(void)
+{
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_markers);
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by markers_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+static struct hlist_head id_table[MARKER_TABLE_SIZE];
+
+struct marker_probe_array {
+ struct rcu_head rcu;
+ struct marker_probe_closure c[0];
+};
+
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * Marker entry modifications are protected by the markers_mutex.
+ */
+struct marker_entry {
+ struct hlist_node hlist;
+ struct hlist_node id_list;
+ char *format;
+ char *name;
+ /* Probe wrapper */
+ void (*call)(const struct marker *mdata, void *call_private, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_array *multi;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ u16 channel_id;
+ u16 event_id;
+ unsigned char ptype:1;
+ unsigned char format_allocated:1;
+ char channel[0]; /* Contains channel'\0'name'\0'format'\0' */
+};
+
+/**
+ * __mark_empty_function - Empty probe callback
+ * @mdata: marker data
+ * @probe_private: probe private data
+ * @call_private: call site private data
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we make sure the execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modify the execution flow of preemptible code.
+ */
+notrace void __mark_empty_function(const struct marker *mdata,
+ void *probe_private, void *call_private, const char *fmt, va_list *args)
+{
+}
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * marker_probe_cb - Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+notrace void marker_probe_cb(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args;
+ char ptype;
+
+ /*
+ * rcu_read_lock_sched does two things: it disables preemption to make
+ * sure the teardown of the callbacks can be done correctly when they
+ * are in modules, and it ensures RCU read coherency.
+ */
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, call_private);
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ va_end(args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++) {
+ va_start(args, call_private);
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ va_end(args);
+ }
+ }
+ rcu_read_unlock_sched_notrace();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb_noarg - Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++)
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ }
+ rcu_read_unlock_sched_notrace();
+}
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_probe_array *multi = container_of(head, struct marker_probe_array, rcu);
+ kfree(multi);
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.probe_private);
+ } else {
+ for (i = 0; entry->multi->c[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi->c[i].func,
+ entry->multi->c[i].probe_private);
+ }
+}
+
+static struct marker_probe_array *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_array *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++)
+ if (old->c[nr_probes].func == probe
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes + 2) * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new->c[0] = entry->single;
+ else
+ memcpy(&new->c[0], &old->c[0],
+ nr_probes * sizeof(struct marker_probe_closure));
+ new->c[nr_probes].func = probe;
+ new->c[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_array *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_array *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) {
+ if ((!probe || old->c[nr_probes].func == probe)
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ entry->single = old->c[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ new->c[j++] = old->c[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *get_marker(const char *channel, const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static struct marker_entry *add_marker(const char *channel, const char *name,
+ const char *format)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ size_t format_len = 0;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ if (format)
+ format_len = strlen(format) + 1;
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "Marker %s.%s busy\n", channel, name);
+ return ERR_PTR(-EBUSY); /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct marker_entry)
+ + channel_len + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+ memcpy(e->channel, channel, channel_len);
+ e->name = &e->channel[channel_len];
+ memcpy(e->name, name, name_len);
+ if (format) {
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ e->channel, e->name, e->format);
+ } else {
+ e->format = NULL;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
+ e->format_allocated = 0;
+ e->refcount = 0;
+ hlist_add_head(&e->hlist, head);
+ return e;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with markers_mutex
+ * held. Parameter "registered" indicates if the channel registration has been
+ * performed.
+ */
+static int remove_marker(const char *channel, const char *name, int registered,
+ int compacting)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ int found = 0;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+ int ret;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+
+ if (registered && ltt_channels_trace_ref())
+ return 0;
+
+ hlist_del(&e->hlist);
+ hlist_del(&e->id_list);
+ if (registered) {
+ ret = ltt_channels_unregister(e->channel, compacting);
+ WARN_ON(ret);
+ }
+ if (e->format_allocated)
+ kfree(e->format);
+ kfree(e);
+ return 0;
+}
+
+/*
+ * Set the mark_entry format to the format found in the element.
+ */
+static int marker_set_format(struct marker_entry *entry, const char *format)
+{
+ entry->format = kstrdup(format, GFP_KERNEL);
+ if (!entry->format)
+ return -ENOMEM;
+ entry->format_allocated = 1;
+
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ entry->channel, entry->name, entry->format);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one marker.
+ */
+static int set_marker(struct marker_entry *entry, struct marker *elem,
+ int active)
+{
+ int ret = 0;
+ WARN_ON(strcmp(entry->name, elem->name) != 0);
+
+ if (entry->format) {
+ if (strcmp(entry->format, elem->format) != 0) {
+ printk(KERN_NOTICE
+ "Format mismatch for probe %s "
+ "(%s), marker (%s)\n",
+ entry->name,
+ entry->format,
+ elem->format);
+ return -EPERM;
+ }
+ } else {
+ ret = marker_set_format(entry, elem->format);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = entry->call;
+ elem->channel_id = entry->channel_id;
+ elem->event_id = entry->event_id;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private != entry->single.probe_private
+ && !elem->ptype);
+ elem->single.probe_private = entry->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = entry->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, entry->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = entry->ptype;
+
+ if (elem->tp_name && (active ^ _imv_read(elem->state))) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+
+ if (active) {
+ /*
+ * try_module_get should always succeed because we hold
+ * lock_module() to get the tp_cb address.
+ */
+ ret = try_module_get(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ BUG_ON(!ret);
+ ret = tracepoint_probe_register_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ } else {
+ ret = tracepoint_probe_unregister_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ }
+ }
+ elem->state__imv = active;
+
+ return ret;
+}
+
+/*
+ * Disable a marker and its probe callback.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is ensured
+ * by rcu_read_lock_sched around the call site.
+ */
+static void disable_marker(struct marker *elem)
+{
+ int ret;
+
+ /* leave "call" as is. It is known statically. */
+ if (elem->tp_name && _imv_read(elem->state)) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+ ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+ elem->tp_cb, NULL);
+ WARN_ON(ret);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address((unsigned long)elem->tp_cb));
+ }
+ elem->state__imv = 0;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
+ /*
+ * Leave the private data and channel_id/event_id there, because removal
+ * is racy and should be done only after an RCU period. These are never
+ * used until the next initialization anyway.
+ */
+}
+
+/*
+ * is_marker_present - Check if a marker is present in kernel.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * We cannot take the marker lock around calls to this function because it needs
+ * to take the module mutex within the iterator. Marker mutex nests inside
+ * module mutex.
+ * Returns 1 if the marker is present, 0 if not.
+ */
+int is_marker_present(const char *channel, const char *name)
+{
+ int ret;
+ struct marker_iter iter;
+
+ ret = 0;
+
+ marker_iter_reset(&iter);
+ marker_iter_start(&iter);
+ for (; iter.marker != NULL; marker_iter_next(&iter)) {
+ if (!strcmp(iter.marker->channel, channel) &&
+ !strcmp(iter.marker->name, name)) {
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ marker_iter_stop(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_present);
+
+/*
+ * _is_marker_enabled - Check if a marker is enabled, must be called with
+ * markers_mutex held.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int _is_marker_enabled(const char *channel, const char *name)
+{
+ struct marker_entry *entry;
+
+ entry = get_marker(channel, name);
+
+ return entry && !!entry->refcount;
+}
+EXPORT_SYMBOL_GPL(_is_marker_enabled);
+
+/*
+ * is_marker_enabled - the wrapper of _is_marker_enabled
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int is_marker_enabled(const char *channel, const char *name)
+{
+ int ret;
+
+ lock_markers();
+ ret = _is_marker_enabled(channel, name);
+ unlock_markers();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_enabled);
+
+/**
+ * marker_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of markers.
+ */
+void marker_update_probe_range(struct marker *begin,
+ struct marker *end)
+{
+ struct marker *iter;
+ struct marker_entry *mark_entry;
+
+ mutex_lock(&markers_mutex);
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_marker(iter->channel, iter->name);
+ if (mark_entry) {
+ set_marker(mark_entry, iter, !!mark_entry->refcount);
+ /*
+ * ignore error, continue
+ */
+ } else {
+ disable_marker(iter);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Side effect: marker_set_format may delete the marker entry (creating a
+ * replacement).
+ */
+void marker_update_probes(void)
+{
+ /* Core kernel markers */
+ marker_update_probe_range(__start___markers, __stop___markers);
+ /* Markers in modules. */
+ module_update_markers();
+ tracepoint_probe_update_all();
+ /* Update immediate values */
+ core_imv_update();
+ module_imv_update();
+}
+
+/**
+ * marker_probe_register - Connect a probe to a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @format: format string
+ * @probe: probe handler
+ * @probe_private: probe private data
+ *
+ * private data must be a valid allocated memory address, or NULL.
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int marker_probe_register(const char *channel, const char *name,
+ const char *format, marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0, ret_err;
+ struct marker_probe_array *old;
+ int first_probe = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ first_probe = 1;
+ entry = add_marker(channel, name, format);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ if (ret)
+ goto end;
+ ret = ltt_channels_register(channel);
+ if (ret)
+ goto error_remove_marker;
+ ret = ltt_channels_get_index_from_name(channel);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->channel_id = ret;
+ ret = ltt_channels_get_event_id(channel, name);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->event_id = ret;
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16) | entry->event_id,
+ MARKER_HASH_BITS));
+ ret = 0;
+ trace_mark(metadata, core_marker_id,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ channel, name, entry->event_id,
+ sizeof(int), sizeof(long), sizeof(void *),
+ sizeof(size_t), ltt_get_alignment());
+ } else if (format) {
+ if (!entry->format)
+ ret = marker_set_format(entry, format);
+ else if (strcmp(entry->format, format))
+ ret = -EPERM;
+ if (ret)
+ goto end;
+ }
+
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
+ if (first_probe)
+ goto error_unregister_channel;
+ else
+ goto end;
+ }
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+error_unregister_channel:
+ ret_err = ltt_channels_unregister(channel, 1);
+ WARN_ON(ret_err);
+error_remove_marker:
+ ret_err = remove_marker(channel, name, 0, 0);
+ WARN_ON(ret_err);
+end:
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* for compaction on error path */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_register);
+
+/**
+ * marker_probe_unregister - Disconnect a probe from a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
+ *
+ * Returns 0 on success, or an error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister(const char *channel, const char *name,
+ marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ struct marker_probe_array *old;
+ int ret = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ old = marker_entry_remove_probe(entry, probe, probe_private);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
+
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_array *closure;
+ unsigned int j;
+
+ closure = entry->multi;
+ /* Use a separate index; reusing 'i' would corrupt the outer table scan. */
+ for (j = 0; closure->c[j].func; j++) {
+ if (closure->c[j].func == probe &&
+ closure->c[j].probe_private
+ == probe_private)
+ return entry;
+ }
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
+ *
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0;
+ struct marker_probe_array *old;
+ const char *channel = NULL, *name = NULL;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker_from_private_data(probe, probe_private);
+ if (!entry) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
+ channel = kstrdup(entry->channel, GFP_KERNEL);
+ name = kstrdup(entry->name, GFP_KERNEL);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ goto end;
+
+unlock:
+ mutex_unlock(&markers_mutex);
+end:
+ kfree(channel);
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
+
+/**
+ * marker_get_private_data - Get a marker's probe private data
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
+ *
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR().
+ * The private data pointer should _only_ be dereferenced if the caller is the
+ * owner of the data, or its content could vanish. This is mostly used to
+ * confirm that a caller is the owner of a registered probe.
+ */
+void *marker_get_private_data(const char *channel, const char *name,
+ marker_probe_func *probe, int num)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ int i;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ } else {
+ struct marker_probe_array *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure->c[i].func; i++) {
+ if (closure->c[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure->c[i].probe_private;
+ }
+ }
+ break;
+ }
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+static struct marker_entry *get_entry_from_id(u16 channel_id, u16 event_id)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e, *found = NULL;
+ u32 hash = hash_32((channel_id << 16) | event_id, MARKER_HASH_BITS);
+
+ mutex_lock(&markers_mutex);
+ head = id_table + hash;
+ hlist_for_each_entry(e, node, head, id_list) {
+ if (e->channel_id == channel_id && e->event_id == event_id) {
+ found = e;
+ break;
+ }
+ }
+ mutex_unlock(&markers_mutex);
+ return found;
+}
+
+/* Must be called while the IDs and the marker_entry are kept alive. */
+const char *marker_get_name_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->name : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_name_from_id);
+
+const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->format : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_fmt_from_id);
+
+/**
+ * markers_compact_event_ids - Compact marker event IDs and reassign channels
+ *
+ * Called by the channel infrastructure when no channel users are active.
+ * Called with the trace lock, lock_markers() and the channel mutex held.
+ *
+ * marker_update_probes() must be executed after compaction, before releasing
+ * the trace lock.
+ */
+void markers_compact_event_ids(void)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node, *next;
+ int ret;
+
+ _ltt_channels_reset_event_ids();
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry_safe(entry, node, next, head, hlist) {
+ if (!entry->refcount) {
+ remove_marker(entry->channel, entry->name,
+ 1, 1);
+ continue;
+ }
+ ret = ltt_channels_get_index_from_name(entry->channel);
+ WARN_ON(ret < 0);
+ entry->channel_id = ret;
+ ret = _ltt_channels_get_event_id(entry->channel,
+ entry->name);
+ WARN_ON(ret < 0);
+ entry->event_id = ret;
+ }
+ }
+
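+	/*
+	 * Rebuild the event ID hash table so that ID-based lookups see the
+	 * newly compacted channel and event IDs.
+	 */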
+ memset(id_table, 0, sizeof(id_table));
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16)
+ | entry->event_id, MARKER_HASH_BITS));
+ }
+ }
+}
+
+#ifdef CONFIG_MODULES
+
+/**
+ * marker_get_iter_range - Get the next marker given a range.
+ * @marker: current marker (in), next marker (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next marker has been found (1) or not (0).
+ * Will return the first marker in the range if the input marker is NULL.
+ */
+int marker_get_iter_range(struct marker **marker, struct marker *begin,
+ struct marker *end)
+{
+ if (!*marker && begin != end) {
+ *marker = begin;
+ return 1;
+ }
+ if (*marker >= begin && *marker < end)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(marker_get_iter_range);
+
+static void marker_get_iter(struct marker_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel markers */
+ if (!iter->module) {
+ found = marker_get_iter_range(&iter->marker,
+ __start___markers, __stop___markers);
+ if (found)
+ goto end;
+ }
+ /* Markers in modules. */
+ found = module_get_iter_markers(iter);
+end:
+ if (!found)
+ marker_iter_reset(iter);
+}
+
+void marker_iter_start(struct marker_iter *iter)
+{
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_start);
+
+void marker_iter_next(struct marker_iter *iter)
+{
+ iter->marker++;
+ /*
+ * iter->marker may be invalid because we blindly incremented it.
+	 * Make sure it is valid by checking it against the marker ranges,
+	 * moving on to the following modules if necessary.
+ */
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_next);
+
+void marker_iter_stop(struct marker_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(marker_iter_stop);
+
+void marker_iter_reset(struct marker_iter *iter)
+{
+ iter->module = NULL;
+ iter->marker = NULL;
+}
+EXPORT_SYMBOL_GPL(marker_iter_reset);
+
+int marker_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ case MODULE_STATE_GOING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ }
+ return 0;
+}
+
+struct notifier_block marker_module_nb = {
+ .notifier_call = marker_module_notify,
+ .priority = 0,
+};
+
+static int init_markers(void)
+{
+ return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
+
+void ltt_dump_marker_state(struct ltt_trace *trace)
+{
+ struct marker_entry *entry;
+ struct ltt_probe_private_data call_data;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ unsigned int i;
+
+ mutex_lock(&markers_mutex);
+ call_data.trace = trace;
+ call_data.serializer = NULL;
+
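+	/*
+	 * Emit one core_marker_id event per registered marker (plus a
+	 * core_marker_format event when a format string is known) into the
+	 * metadata channel.
+	 */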
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ __trace_mark(0, metadata, core_marker_id,
+ &call_data,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ entry->channel,
+ entry->name,
+ entry->event_id,
+ sizeof(int), sizeof(long),
+ sizeof(void *), sizeof(size_t),
+ ltt_get_alignment());
+ if (entry->format)
+ __trace_mark(0, metadata,
+ core_marker_format,
+ &call_data,
+ "channel %s name %s format %s",
+ entry->channel,
+ entry->name,
+ entry->format);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_dump_marker_state);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94b..2767c8eaf12 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
+#include <trace/kernel.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -99,7 +100,9 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete uses stop_machine/add uses RCU list operations).
+ * Sorted by ascending list node address.
+ */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -120,6 +123,9 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+DEFINE_TRACE(kernel_module_load);
+DEFINE_TRACE(kernel_module_free);
+
int register_module_notifier(struct notifier_block * nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -1675,6 +1681,7 @@ static inline void unset_section_ro_nx(struct module *mod, void *module_region)
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
+ trace_kernel_module_free(mod);
trace_module_free(mod);
/* Delete from various lists */
@@ -2272,6 +2279,12 @@ static int copy_and_check(struct load_info *info,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return -ENOMEM;
+ /*
+	 * Make sure accesses to the module text or data never generate any
+	 * page fault.
+ */
+ vmalloc_sync_all();
+
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@@ -2459,6 +2472,10 @@ static void find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->ctors), &mod->num_ctors);
#endif
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(info, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
sizeof(*mod->tracepoints_ptrs),
@@ -2717,7 +2734,7 @@ static struct module *load_module(void __user *umod,
const char __user *uargs)
{
struct load_info info = { NULL, };
- struct module *mod;
+ struct module *mod, *iter;
long err;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2799,7 +2816,23 @@ static struct module *load_module(void __user *umod,
goto ddebug;
module_bug_finalize(info.hdr, info.sechdrs, mod);
+ /*
+	 * We sort the modules by struct module pointer address to permit
+	 * correct iteration over modules by preemptible operations such as
+	 * the kallsyms read() path. Sorting by struct module pointer address
+	 * is equivalent to sorting by list node address.
+ */
+ list_for_each_entry_reverse(iter, &modules, list) {
+ BUG_ON(iter == mod); /* Should never be in the list twice */
+ if (iter < mod) {
+ /* We belong to the location right after iter. */
+ list_add_rcu(&mod->list, &iter->list);
+ goto module_added;
+ }
+ }
+ /* We should be added at the head of the list */
list_add_rcu(&mod->list, &modules);
+module_added:
mutex_unlock(&module_mutex);
/* Module is ready to execute: parsing args may do that. */
@@ -2817,6 +2850,7 @@ static struct module *load_module(void __user *umod,
free_copy(&info);
/* Done! */
+ trace_kernel_module_load(mod);
trace_module_load(mod);
return mod;
@@ -3196,12 +3230,12 @@ static char *module_flags(struct module *mod, char *buf)
static void *m_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
+ return seq_sorted_list_start(&modules, pos);
}
static void *m_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &modules, pos);
+ return seq_sorted_list_next(p, &modules, pos);
}
static void m_stop(struct seq_file *m, void *p)
@@ -3266,6 +3300,27 @@ static int __init proc_modules_init(void)
module_init(proc_modules_init);
#endif
+void list_modules(void *call_data)
+{
+ /* Enumerate loaded modules */
+ struct list_head *i;
+ struct module *mod;
+ unsigned long refcount = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each(i, &modules) {
+ mod = list_entry(i, struct module, list);
+#ifdef CONFIG_MODULE_UNLOAD
+ refcount = module_refcount(mod);
+#endif
+ __trace_mark(0, module_state, list_module, call_data,
+ "name %s state %d refcount %lu",
+ mod->name, mod->state, refcount);
+ }
+ mutex_unlock(&module_mutex);
+}
+EXPORT_SYMBOL_GPL(list_modules);
+
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -3393,12 +3448,59 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
+ struct marker *marker,
struct tracepoint * const *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
+#ifdef CONFIG_MARKERS
+void module_update_markers(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!(mod->taints & TAINT_FORCED_MODULE))
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ mutex_unlock(&module_mutex);
+}
+
+/*
+ * Returns 1 if the current iterator position was found, 0 otherwise.
+ */
+int module_get_iter_markers(struct marker_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!(iter_mod->taints & TAINT_FORCED_MODULE)) {
+			/*
+			 * The module list is sorted by address: skip modules
+			 * already iterated over, and reset the marker cursor
+			 * when moving on to a new module.
+			 */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->marker = NULL;
+ found = marker_get_iter_range(&iter->marker,
+ iter_mod->markers,
+ iter_mod->markers + iter_mod->num_markers);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+#endif
+
#ifdef CONFIG_TRACEPOINTS
void module_update_tracepoints(void)
{
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb56..e8481427153 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,6 +5,7 @@
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#include <linux/idle.h>
/*
* Notifier list for kernel code which wants to be called
@@ -148,7 +149,7 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
- synchronize_rcu();
+ synchronize_sched();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
@@ -178,9 +179,9 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
{
int ret;
- rcu_read_lock();
+ rcu_read_lock_sched_notrace();
ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
- rcu_read_unlock();
+ rcu_read_unlock_sched_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
@@ -584,3 +585,27 @@ int unregister_die_notifier(struct notifier_block *nb)
return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+/*
+ * Trace last event before calling notifiers. Notifiers flush data from buffers
+ * before going to idle.
+ */
+int notrace notify_idle(enum idle_val val)
+{
+ return atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(notify_idle);
+
+void register_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(register_idle_notifier);
+
+void unregister_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(unregister_idle_notifier);
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a170..3fd05f5708c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,9 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
+#include <trace/kernel.h>
+
+DEFINE_TRACE(kernel_panic);
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -64,6 +67,10 @@ NORET_TYPE void panic(const char * fmt, ...)
long i, i_next = 0;
int state = 0;
+ va_start(args, fmt);
+ trace_kernel_panic(fmt, args);
+ va_end(args);
+
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22..99373ab79a4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -40,6 +40,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/rculist.h>
+#include <trace/kernel.h>
#include <asm/uaccess.h>
@@ -67,6 +68,7 @@ int console_printk[4] = {
MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
/*
* Low level drivers may need that to know if they can schedule in
@@ -136,6 +138,9 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+DEFINE_TRACE(kernel_printk);
+DEFINE_TRACE(kernel_vprintk);
+
#ifdef CONFIG_PRINTK
static char __log_buf[__LOG_BUF_LEN];
@@ -650,6 +655,7 @@ asmlinkage int printk(const char *fmt, ...)
}
#endif
va_start(args, fmt);
+ trace_kernel_printk(_RET_IP_);
r = vprintk(fmt, args);
va_end(args);
@@ -773,6 +779,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
+ trace_kernel_vprintk(_RET_IP_, printk_buf, printed_len);
p = printk_buf;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8..a86e46b6bc1 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,7 @@
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
+#include <trace/rcu.h>
#include "rcutree.h"
@@ -145,6 +146,10 @@ int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
module_param(rcu_cpu_stall_suppress, int, 0644);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+DEFINE_TRACE(rcu_tree_call_rcu);
+DEFINE_TRACE(rcu_tree_call_rcu_bh);
+DEFINE_TRACE(rcu_tree_callback);
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -1143,6 +1148,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ trace_rcu_tree_callback(list);
list->func(list);
list = next;
if (++count >= rdp->blimit)
@@ -1488,6 +1494,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
*/
void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
+ trace_rcu_tree_call_rcu_bh(head, _RET_IP_);
__call_rcu(head, func, &rcu_bh_state);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
diff --git a/kernel/sched.c b/kernel/sched.c
index c164920c8ce..936fd7bc449 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9255,3 +9255,57 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
+static DEFINE_MUTEX(kernel_trace_mutex);
+static int kernel_trace_refcount;
+
+/**
+ * clear_kernel_trace_flag_all_tasks - clears all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates on all threads in the system to clear their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish clearing
+ * the thread flags, all threads have their flags cleared.
+ */
+void clear_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (--kernel_trace_refcount)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ clear_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(clear_kernel_trace_flag_all_tasks);
+
+/**
+ * set_kernel_trace_flag_all_tasks - sets all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates on all threads in the system to set their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish setting
+ * the thread flags, all threads have their flags set.
+ */
+void set_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (kernel_trace_refcount++)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(set_kernel_trace_flag_all_tasks);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec38..a25bf611d13 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,7 +23,10 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/marker.h>
+#include <linux/kallsyms.h>
#include <linux/tick.h>
+#include <trace/irq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -54,6 +57,20 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
+void ltt_dump_softirq_vec(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
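+	/* Record the address and symbol name of each softirq handler. */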
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+ sprint_symbol(namebuf, (unsigned long)softirq_vec[i].action);
+ __trace_mark(0, softirq_state, softirq_vec, call_data,
+ "id %d address %p symbol %s",
+ i, softirq_vec[i].action, namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_softirq_vec);
+
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
@@ -61,6 +78,11 @@ char *softirq_to_name[NR_SOFTIRQS] = {
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
+DEFINE_TRACE(irq_tasklet_high_entry);
+DEFINE_TRACE(irq_tasklet_high_exit);
+DEFINE_TRACE(irq_tasklet_low_entry);
+DEFINE_TRACE(irq_tasklet_low_exit);
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -341,6 +363,7 @@ void irq_exit(void)
*/
inline void raise_softirq_irqoff(unsigned int nr)
{
+ trace_softirq_raise(nr);
__raise_softirq_irqoff(nr);
/*
@@ -440,7 +463,9 @@ static void tasklet_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_low_entry(t);
t->func(t->data);
+ trace_irq_tasklet_low_exit(t);
tasklet_unlock(t);
continue;
}
@@ -475,7 +500,9 @@ static void tasklet_hi_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_high_entry(t);
t->func(t->data);
+ trace_irq_tasklet_high_exit(t);
tasklet_unlock(t);
continue;
}
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06..dbaa0648631 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
+obj-$(CONFIG_HAVE_UNSYNCHRONIZED_TSC) += tsc-sync.o
diff --git a/kernel/time/tsc-sync.c b/kernel/time/tsc-sync.c
new file mode 100644
index 00000000000..2ac1544ee22
--- /dev/null
+++ b/kernel/time/tsc-sync.c
@@ -0,0 +1,313 @@
+/*
+ * kernel/time/tsc-sync.c
+ *
+ * Test TSC synchronization
+ *
+ * Marks the TSC as unstable _and_ keeps a simple "_tsc_is_sync" variable,
+ * which is fast to read when a simple test must determine which clock source
+ * to use for kernel tracing.
+ *
+ * - CPU init :
+ *
+ * We check whether all boot CPUs have their TSCs synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * (The serial nature of the boot logic and the CPU hotplug lock
+ * protect against more than 2 CPUs entering this code.)
+ *
+ * - When CPUs are up :
+ *
+ * TSC synchronicity of all CPUs can be checked later at run-time by calling
+ * test_tsc_synchronization().
+ *
+ * Copyright 2007, 2008
+ * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ */
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/trace-clock.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#define MAX_CYCLES_DELTA 3000ULL
+
+/*
+ * Number of loops to take care of MCE, NMIs, SMIs.
+ */
+#define NR_LOOPS 200
+
+static DEFINE_MUTEX(tscsync_mutex);
+
+struct sync_data {
+ int nr_waits;
+ int wait_sync;
+ cycles_t tsc_count;
+} ____cacheline_aligned;
+
+/* 0 is master, 1 is slave */
+static struct sync_data sync_data[2] = {
+ [0 ... 1] = {
+ .nr_waits = 3 * NR_LOOPS + 1,
+ .wait_sync = 3 * NR_LOOPS + 1,
+ },
+};
+
+int _tsc_is_sync = 1;
+EXPORT_SYMBOL(_tsc_is_sync);
+
+static int force_tsc_sync;
+static cycles_t slave_offset;
+static int slave_offset_ready; /* for 32-bits architectures */
+
+static int __init force_tsc_sync_setup(char *str)
+{
+ force_tsc_sync = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("force_tsc_sync=", force_tsc_sync_setup);
+
+/*
+ * Mark it noinline to make sure it is not inlined into its callers.
+ * Waits until the synchronization value is reached.
+ */
+static noinline void tsc_barrier(long this_cpu)
+{
+ sync_core();
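+	/*
+	 * Announce that this CPU reached the barrier, then spin until the
+	 * other CPU catches up before sampling the cycle counter.
+	 */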
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+				 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ get_cycles_barrier();
+ sync_data[this_cpu].tsc_count = get_cycles();
+ get_cycles_barrier();
+}
+
+/*
+ * Worker thread called on each CPU.
+ * First wait with interrupts enabled, then wait with interrupt disabled,
+ * for precision. We are already bound to one CPU.
+ * this_cpu 0 : master
+ * this_cpu 1 : slave
+ */
+static void test_sync(void *arg)
+{
+ long this_cpu = (long)arg;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Make sure the instructions are in I-CACHE */
+ tsc_barrier(this_cpu);
+ tsc_barrier(this_cpu);
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+				 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ /*
+ * Here, only the master will wait for the slave to reach this barrier.
+ * This makes sure that the master, which holds the mutex and will reset
+ * the barriers, waits for the slave to stop using the barrier values
+ * before it continues. This is only done at the complete end of all the
+	 * loops. This is why there is a + 1 in the original wait_sync value.
+ */
+ if (sync_data[this_cpu].nr_waits == 1)
+ sync_data[this_cpu].wait_sync--;
+ local_irq_restore(flags);
+}
+
+/*
+ * Each CPU (master and target) must decrement the wait_sync value twice (once
+ * for priming the cache), and once more after the get_cycles() read. After all
+ * the loops, one last synchronization is required to make sure the master
+ * waits for the slave before resetting the barriers.
+ */
+static void reset_barriers(void)
+{
+ int i;
+
+ /*
+	 * Wait until the slave is done so that we don't overwrite
+	 * wait_sync prematurely.
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1].wait_sync >= sync_data[0].nr_waits))
+ barrier(); /*
+			 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(sync_data[i].wait_sync != 0);
+ WARN_ON(sync_data[i].nr_waits != 1);
+ sync_data[i].wait_sync = 3 * NR_LOOPS + 1;
+ sync_data[i].nr_waits = 3 * NR_LOOPS + 1;
+ }
+}
+
+/*
+ * Run several loops (making sure no unexpected event changes the timing) and
+ * keep the best one. The result of each loop is the highest TSC delta between
+ * the master CPU and the slaves. CPU hotplug is disabled while this code runs
+ * so that we remain concurrency-safe wrt the CPU hotplug paths that also use
+ * this code. We test TSC synchronization even if we already "know" the CPUs
+ * were not synchronized. This
+ * can be used as a test to check if, for some reason, the CPUs eventually got
+ * in sync after a CPU has been unplugged. This code is kept separate from the
+ * CPU hotplug code because the slave CPU executes in an IPI, which we want to
+ * keep as short as possible (this is happening while the system is running).
+ * Therefore, we do not send a single IPI for all the test loops, but rather
+ * send one IPI per loop.
+ */
+int test_tsc_synchronization(void)
+{
+ long cpu, master;
+ cycles_t max_diff = 0, diff, best_loop, worse_loop = 0;
+ int i;
+
+ mutex_lock(&tscsync_mutex);
+ get_online_cpus();
+
+ printk(KERN_INFO
+ "checking TSC synchronization across all online CPUs:");
+
+ preempt_disable();
+ master = smp_processor_id();
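+	/*
+	 * For each slave CPU, run NR_LOOPS measurement rounds and keep the
+	 * smallest delta; the worst per-CPU best delta decides whether the
+	 * TSCs are considered synchronized.
+	 */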
+ for_each_online_cpu(cpu) {
+ if (master == cpu)
+ continue;
+ best_loop = (cycles_t)ULLONG_MAX;
+ for (i = 0; i < NR_LOOPS; i++) {
+ smp_call_function_single(cpu, test_sync,
+ (void *)1UL, 0);
+ test_sync((void *)0UL);
+ diff = abs(sync_data[1].tsc_count
+ - sync_data[0].tsc_count);
+ best_loop = min(best_loop, diff);
+ worse_loop = max(worse_loop, diff);
+ }
+ reset_barriers();
+ max_diff = max(best_loop, max_diff);
+ }
+ preempt_enable();
+ if (max_diff >= MAX_CYCLES_DELTA) {
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)max_diff);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ } else {
+ printk(" passed.\n");
+ }
+ put_online_cpus();
+ mutex_unlock(&tscsync_mutex);
+ return max_diff < MAX_CYCLES_DELTA;
+}
+EXPORT_SYMBOL_GPL(test_tsc_synchronization);
+
+/*
+ * Test synchronicity of a single core when it is hotplugged.
+ * Source CPU calls into this - waits for the freshly booted target CPU to
+ * arrive and then start the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+ cycles_t diff, abs_diff,
+ best_loop = (cycles_t)ULLONG_MAX, worse_loop = 0;
+ int i;
+
+ /*
+ * No need to check if we already know that the TSC is not synchronized:
+ */
+ if (!force_tsc_sync && unsynchronized_tsc()) {
+ /*
+		 * Make sure we set _tsc_is_sync to 0 if the TSC is found
+		 * to be unsynchronized for causes other than non-synchronized
+		 * TSCs across CPUs.
+ */
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ return;
+ }
+
+ printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
+
+ for (i = 0; i < NR_LOOPS; i++) {
+ test_sync((void *)0UL);
+ diff = sync_data[1].tsc_count - sync_data[0].tsc_count;
+ abs_diff = abs(diff);
+ best_loop = min(best_loop, abs_diff);
+ worse_loop = max(worse_loop, abs_diff);
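+		/*
+		 * Remember the signed offset seen during the best (smallest
+		 * absolute delta) loop; the slave subtracts it from its TSC
+		 * when synchronization is forced.
+		 */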
+ if (force_tsc_sync && best_loop == abs_diff)
+ slave_offset = diff;
+ }
+ reset_barriers();
+
+ if (!force_tsc_sync && best_loop >= MAX_CYCLES_DELTA) {
+ printk(" failed.\n");
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)best_loop);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ } else {
+ printk(" %s.\n", !force_tsc_sync ? "passed" : "forced");
+ }
+ if (force_tsc_sync) {
+ /* order slave_offset and slave_offset_ready writes */
+ smp_wmb();
+ slave_offset_ready = 1;
+ }
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+ int i;
+
+ if (!force_tsc_sync && unsynchronized_tsc())
+ return;
+
+ for (i = 0; i < NR_LOOPS; i++)
+ test_sync((void *)1UL);
+
+ /*
+ * Force slave synchronization if requested.
+ */
+ if (force_tsc_sync) {
+ unsigned long flags;
+ cycles_t new_tsc;
+
+ while (!slave_offset_ready)
+ cpu_relax();
+ /* order slave_offset and slave_offset_ready reads */
+ smp_rmb();
+ local_irq_save(flags);
+ /*
+ * slave_offset is read when master has finished writing to it,
+ * and is protected by cpu hotplug serialization.
+ */
+ new_tsc = get_cycles() - slave_offset;
+ write_tsc((u32)new_tsc, (u32)((u64)new_tsc >> 32));
+ local_irq_restore(flags);
+ }
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d24..65cc58ce148 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -40,12 +40,14 @@
#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>
+#include <asm/irq_regs.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -54,6 +56,10 @@ u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
+DEFINE_TRACE(timer_set);
+DEFINE_TRACE(timer_update_time);
+DEFINE_TRACE(timer_timeout);
+
/*
* per-CPU timer vector definitions:
*/
@@ -366,6 +372,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
+ trace_timer_set(timer);
/*
* Timers are FIFO:
*/
@@ -1303,8 +1310,13 @@ void run_local_timers(void)
void do_timer(unsigned long ticks)
{
+ struct timespec curtime, wtom;
+
jiffies_64 += ticks;
update_wall_time();
+ curtime = __current_kernel_time();
+ wtom = __get_wall_to_monotonic();
+ trace_timer_update_time(&curtime, &wtom);
calc_global_load(ticks);
}
@@ -1387,7 +1399,9 @@ SYSCALL_DEFINE0(getegid)
static void process_timeout(unsigned long __data)
{
- wake_up_process((struct task_struct *)__data);
+ struct task_struct *task = (struct task_struct *)__data;
+ trace_timer_timeout(task);
+ wake_up_process(task);
}
/**
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..614d9153a24 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace-clock-32-to-64.o
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace-clock.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/trace-clock-32-to-64.c b/kernel/trace/trace-clock-32-to-64.c
new file mode 100644
index 00000000000..c036f5c5586
--- /dev/null
+++ b/kernel/trace/trace-clock-32-to-64.c
@@ -0,0 +1,296 @@
+/*
+ * kernel/trace/trace-clock-32-to-64.c
+ *
+ * (C) Copyright 2006,2007,2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Extends a 32-bit clock source to a full 64-bit count, readable atomically
+ * from any execution context.
+ *
+ * Notes :
+ * - The 32->64 bits extended, timer-based trace clock cannot be used for early
+ *   tracing in the boot process, as it depends on timer interrupts.
+ * - The timer is only on one CPU to support hotplug.
+ * - We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* needed due to include order problem on m68k */
+#include <linux/math64.h>
+
+#define HW_BITMASK ((1ULL << TC_HW_BITS) - 1)
+#define HW_LS32(hw) ((hw) & HW_BITMASK)
+#define SW_MS32(sw) ((sw) & ~HW_BITMASK)
+
+static DEFINE_SPINLOCK(synthetic_tsc_lock);
+static int synthetic_tsc_refcount; /* Number of readers */
+static int synthetic_tsc_enabled; /* synth. TSC enabled on all online CPUs */
+
+static DEFINE_PER_CPU(struct timer_list, tsc_timer);
+static unsigned int precalc_expire;
+
+struct synthetic_tsc_struct {
+ union {
+ u64 val;
+ struct {
+#ifdef __BIG_ENDIAN
+ u32 ms32;
+ u32 ls32;
+#else
+ u32 ls32;
+ u32 ms32;
+#endif
+ } sel;
+ } tsc[2];
+ unsigned int index; /* Index of the current synth. tsc. */
+};
+
+static DEFINE_PER_CPU(struct synthetic_tsc_struct, synthetic_tsc);
+
+/* Called from IPI or timer interrupt */
+static void update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ if (tsc < HW_LS32(cpu_synth->tsc[cpu_synth->index].sel.ls32)) {
+ unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non atomic update of the non current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index].val =
+ (SW_MS32(cpu_synth->tsc[cpu_synth->index].val)
+ | (u64)tsc) + (1ULL << TC_HW_BITS);
+ /*
+ * Ensure the compiler does not reorder index write. It makes
+ * sure all nested interrupts will see the new value before the
+ * new index is written.
+ */
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+		 * No overflow: we know that the only bits changed are
+		 * contained in the 32 LSBs, which can be written atomically.
+ */
+ cpu_synth->tsc[cpu_synth->index].sel.ls32 =
+ SW_MS32(cpu_synth->tsc[cpu_synth->index].sel.ls32) | tsc;
+ }
+}
+
+/*
+ * Should only be called when interrupts are off. Affects only current CPU.
+ */
+void _trace_clock_write_synthetic_tsc(u64 value)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ unsigned int new_index;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ cpu_synth->tsc[new_index].val = value;
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 notrace trace_clock_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable_notrace();
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ index = ACCESS_ONCE(cpu_synth->index); /* atomic read */
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ /* Overflow detection */
+ if (unlikely(tsc < HW_LS32(cpu_synth->tsc[index].sel.ls32)))
+ ret = (SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc)
+ + (1ULL << TC_HW_BITS);
+ else
+ ret = SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_clock_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ update_synthetic_tsc();
+}
+
+/*
+ * tsc_timer_fct - Timer function updating the synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least one execution before the low word of the TSC wraps.
+ */
+static void tsc_timer_fct(unsigned long data)
+{
+ update_synthetic_tsc();
+
+ mod_timer_pinned(&per_cpu(tsc_timer, smp_processor_id()),
+ jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval - Precalculate the interval between the hardware
+ * clock wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ u64 rem_freq, rem_interval;
+
+ precalc_expire =
+ __iter_div_u64_rem(HW_BITMASK, (
+ __iter_div_u64_rem(trace_clock_frequency(),
+ HZ * trace_clock_freq_scale(), &rem_freq) << 1
+ )
+ - 1
+ - (TC_EXPECTED_INTERRUPT_LATENCY * HZ / 1000), &rem_interval)
+ >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+static void prepare_synthetic_tsc(int cpu)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ cpu_synth = &per_cpu(synthetic_tsc, cpu);
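+	/*
+	 * Seed the upcoming CPU's synthetic TSC from the current CPU's value
+	 * so the extended 64-bit count stays consistent across hotplug.
+	 */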
+ local_count = trace_clock_read_synthetic_tsc();
+ cpu_synth->tsc[0].val = local_count;
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ init_timer_deferrable(&per_cpu(tsc_timer, cpu));
+ per_cpu(tsc_timer, cpu).function = tsc_timer_fct;
+ per_cpu(tsc_timer, cpu).expires = jiffies + precalc_expire;
+}
+
+static void enable_synthetic_tsc(int cpu)
+{
+ smp_call_function_single(cpu, synthetic_tsc_ipi, NULL, 1);
+ add_timer_on(&per_cpu(tsc_timer, cpu), cpu);
+}
+
+static void disable_synthetic_tsc(int cpu)
+{
+ del_timer_sync(&per_cpu(tsc_timer, cpu));
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ prepare_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ enable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ disable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+void get_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
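+	/* Only the first user prepares and arms the per-CPU timers. */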
+ if (synthetic_tsc_refcount++)
+ goto end;
+
+ synthetic_tsc_enabled = 1;
+ for_each_online_cpu(cpu) {
+ prepare_synthetic_tsc(cpu);
+ enable_synthetic_tsc(cpu);
+ }
+end:
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(get_synthetic_tsc);
+
+void put_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
+ WARN_ON(synthetic_tsc_refcount <= 0);
+ if (synthetic_tsc_refcount != 1 || !synthetic_tsc_enabled)
+ goto end;
+
+ for_each_online_cpu(cpu)
+ disable_synthetic_tsc(cpu);
+ synthetic_tsc_enabled = 0;
+end:
+ synthetic_tsc_refcount--;
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(put_synthetic_tsc);
+
+/* Called from CPU 0, before any tracing starts, to init each structure */
+static int __init init_synthetic_tsc(void)
+{
+ precalc_stsc_interval();
+ hotcpu_notifier(hotcpu_callback, 3);
+ return 0;
+}
+
+/* Ideally run before SMP is up; late_initcall is used as a workaround for OMAP4 */
+late_initcall(init_synthetic_tsc);
diff --git a/kernel/trace/trace-clock.c b/kernel/trace/trace-clock.c
new file mode 100644
index 00000000000..3ed1667aacb
--- /dev/null
+++ b/kernel/trace/trace-clock.c
@@ -0,0 +1,97 @@
+/*
+ * kernel/trace/trace-clock.c
+ *
+ * (C) Copyright 2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Generic kernel tracing clock for architectures without TSC.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/jiffies.h>
+
+static int trace_clock_refcount;
+static DEFINE_MUTEX(trace_clock_mutex);
+static struct timer_list trace_clock_timer;
+/*
+ * bits 0..12 : counter, atomically incremented
+ * bits 13..{32,64} : time counter, incremented each jiffy.
+ */
+atomic_long_t trace_clock_var;
+EXPORT_SYMBOL(trace_clock_var);
+
+static void trace_clock_update(void)
+{
+ long old_clock, new_clock;
+ unsigned long ticks;
+
+ /*
+ * Make sure we keep track of delayed timer.
+	 * Make sure we account for a delayed timer (ticks can be > 1).
+ ticks = jiffies - trace_clock_timer.expires + 1;
+ /* Don't update if ticks is zero, time would go backward. */
+ if (unlikely(!ticks))
+ return;
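+	/*
+	 * Advance the time bits by "ticks" jiffies and clear the low-order
+	 * event counter bits; retry if a concurrent counter increment raced
+	 * with this update.
+	 */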
+ do {
+ old_clock = atomic_long_read(&trace_clock_var);
+ new_clock = (old_clock + (ticks << TRACE_CLOCK_SHIFT))
+ & (~((1 << TRACE_CLOCK_SHIFT) - 1));
+ } while (atomic_long_cmpxchg(&trace_clock_var, old_clock, new_clock)
+ != old_clock);
+}
+
+static void trace_clock_timer_fct(unsigned long data)
+{
+ trace_clock_update();
+ trace_clock_timer.expires = jiffies + 1;
+ add_timer(&trace_clock_timer);
+}
+
+static void enable_trace_clock(void)
+{
+ init_timer(&trace_clock_timer);
+ /* trace_clock_update() reads expires */
+ trace_clock_timer.function = trace_clock_timer_fct;
+ trace_clock_timer.expires = jiffies + 1;
+ trace_clock_update();
+ add_timer(&trace_clock_timer);
+}
+
+static void disable_trace_clock(void)
+{
+ del_timer_sync(&trace_clock_timer);
+}
+
+void get_trace_clock(void)
+{
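+	/*
+	 * Take a reference on the synthetic TSC first; the jiffies-based
+	 * trace clock timer is only started for the first user.
+	 */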
+ get_synthetic_tsc();
+ mutex_lock(&trace_clock_mutex);
+ if (trace_clock_refcount++)
+ goto end;
+ enable_trace_clock();
+end:
+ mutex_unlock(&trace_clock_mutex);
+}
+EXPORT_SYMBOL_GPL(get_trace_clock);
+
+void put_trace_clock(void)
+{
+ mutex_lock(&trace_clock_mutex);
+ WARN_ON(trace_clock_refcount <= 0);
+ if (trace_clock_refcount != 1)
+ goto end;
+ disable_trace_clock();
+end:
+ trace_clock_refcount--;
+ mutex_unlock(&trace_clock_mutex);
+ put_synthetic_tsc();
+}
+EXPORT_SYMBOL_GPL(put_trace_clock);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf..687699d365a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,6 +11,7 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>