Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    4
-rw-r--r--  kernel/exit.c                        |    4
-rw-r--r--  kernel/fork.c                        |   10
-rw-r--r--  kernel/futex.c                       |  125
-rw-r--r--  kernel/irq/handle.c                  |    8
-rw-r--r--  kernel/irq/irqdesc.c                 |    2
-rw-r--r--  kernel/itimer.c                      |    7
-rw-r--r--  kernel/kexec.c                       |    8
-rw-r--r--  kernel/lockdep.c                     |   22
-rw-r--r--  kernel/ltt-channels.c                |  388
-rw-r--r--  kernel/marker.c                      | 1262
-rw-r--r--  kernel/module.c                      |  110
-rw-r--r--  kernel/notifier.c                    |   31
-rw-r--r--  kernel/panic.c                       |    7
-rw-r--r--  kernel/printk.c                      |    7
-rw-r--r--  kernel/rcutree.c                     |    7
-rw-r--r--  kernel/sched.c                       |   54
-rw-r--r--  kernel/softirq.c                     |   27
-rw-r--r--  kernel/time/Makefile                 |    1
-rw-r--r--  kernel/time/tsc-sync.c               |  313
-rw-r--r--  kernel/timer.c                       |   16
-rw-r--r--  kernel/trace/Makefile                |    2
-rw-r--r--  kernel/trace/trace-clock-32-to-64.c  |  296
-rw-r--r--  kernel/trace/trace-clock.c           |   97
-rw-r--r--  kernel/trace/trace_printk.c          |    1
25 files changed, 2740 insertions, 69 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba3..c039580ba3b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
@@ -99,7 +100,10 @@ obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_MARKERS) += ltt-channels.o
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b..0d9a3444614 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -514,6 +514,8 @@ struct files_struct *get_files_struct(struct task_struct *task)
return files;
}
+EXPORT_SYMBOL(get_files_struct);
+
void put_files_struct(struct files_struct *files)
{
struct fdtable *fdt;
@@ -535,6 +537,8 @@ void put_files_struct(struct files_struct *files)
}
}
+EXPORT_SYMBOL(put_files_struct);
+
void reset_files_struct(struct files_struct *files)
{
struct task_struct *tsk = current;
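The two EXPORT_SYMBOL() lines above make get_files_struct() and put_files_struct() available to modules. A minimal sketch of the kind of module-side caller this enables; the helper name and the dump format are illustrative assumptions, not part of the patch:

#include <linux/sched.h>
#include <linux/fdtable.h>

/* Hypothetical user of the exports above: walk another task's fd table. */
static void example_dump_task_fds(struct task_struct *tsk)
{
	struct files_struct *files;
	struct fdtable *fdt;
	unsigned int fd;

	files = get_files_struct(tsk);	/* takes a reference; NULL if task exited */
	if (!files)
		return;

	rcu_read_lock();
	fdt = files_fdtable(files);
	for (fd = 0; fd < fdt->max_fds; fd++) {
		if (fcheck_files(files, fd))
			printk(KERN_DEBUG "pid %d: fd %u in use\n",
			       task_pid_nr(tsk), fd);
	}
	rcu_read_unlock();

	put_files_struct(files);
}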
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152dd..5bb0bb18434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -88,6 +88,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+EXPORT_SYMBOL(tasklist_lock);
#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
@@ -1250,6 +1251,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /*
+ * The state of the parent's TIF_KERNEL_TRACE flag may have changed
+ * since it was copied in dup_task_struct(), so we re-copy it here.
+ */
+ if (test_thread_flag(TIF_KERNEL_TRACE))
+ set_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+ else
+ clear_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
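The hunk above re-copies the parent's TIF_KERNEL_TRACE flag under tasklist_lock, so a child forked while tracing is being switched on or off still ends up with the current value. The other half of that mechanism would look roughly like the sketch below; it assumes TIF_KERNEL_TRACE is provided by the architecture part of this patch set (not shown here) and the helper name is illustrative:

#include <linux/sched.h>

/* Hypothetical tracer-side helper: update every existing thread. */
static void example_set_kernel_trace_flag(int enable)
{
	struct task_struct *g, *t;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (enable)
			set_tsk_thread_flag(t, TIF_KERNEL_TRACE);
		else
			clear_tsk_thread_flag(t, TIF_KERNEL_TRACE);
	} while_each_thread(g, t);
	read_unlock(&tasklist_lock);
}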
diff --git a/kernel/futex.c b/kernel/futex.c
index d5065e8283d..3cd76c15b09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
@@ -723,14 +722,12 @@ retry:
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
ownerdied = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
@@ -775,6 +772,24 @@ retry:
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
+ || plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
plist_del(&q->list, &hb1->chain);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
@@ -1525,8 +1525,7 @@ retry:
static void unqueue_me_pi(struct futex_q *q)
__releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1578,9 +1577,7 @@ retry:
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
@@ -1781,13 +1778,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -2046,9 +2044,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
@@ -2057,7 +2055,7 @@ retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2070,14 @@ retry:
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & FUTEX_OWNER_DIED) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
@@ -2167,7 +2162,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2463,11 +2458,20 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
@@ -2678,8 +2682,7 @@ static int __init futex_init(void)
* implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
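The futex.c hunks above change cmpxchg_futex_value_locked() so that its return value only reports whether the user-space access faulted, while the value actually read from *uaddr is handed back through the new first argument. A minimal sketch of the resulting caller pattern; the wrapper below is purely illustrative and not part of the patch:

/* Illustrative only: shows the new calling convention, not real futex logic. */
static int example_try_claim_futex(u32 __user *uaddr, u32 expected, u32 newval)
{
	u32 curval;

	/* A non-zero return now means the access itself faulted. */
	if (cmpxchg_futex_value_locked(&curval, uaddr, expected, newval))
		return -EFAULT;

	/* The access succeeded; curval holds whatever was found at uaddr. */
	if (curval != expected)
		return -EAGAIN;	/* raced with another task */

	return 0;		/* newval is now installed */
}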
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a719012..db864334a95 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <trace/events/irq.h>
+#include <trace/irq.h>
#include "internals.h"
@@ -51,6 +52,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
+DEFINE_TRACE(irq_entry);
+DEFINE_TRACE(irq_exit);
+
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -63,6 +67,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
+ trace_irq_entry(irq, NULL, action);
+
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
@@ -116,5 +122,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
add_interrupt_randomness(irq);
local_irq_disable();
+ trace_irq_exit(retval);
+
return retval;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bd..1c07afd307f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -109,6 +109,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
@@ -273,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153d..18fd8e919c0 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -13,9 +13,13 @@
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
+DEFINE_TRACE(timer_itimer_expired);
+DEFINE_TRACE(timer_itimer_set);
+
/**
* itimer_get_remtime - get remaining time for the timer
*
@@ -124,6 +128,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
+ trace_timer_itimer_expired(sig);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
@@ -201,6 +206,8 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
!timeval_valid(&value->it_interval))
return -EINVAL;
+ trace_timer_itimer_set(which, value);
+
switch (which) {
case ITIMER_REAL:
again:
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7eb..779f0031929 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/kmsg_dump.h>
+#include <trace/kernel.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -40,6 +41,9 @@
#include <asm/system.h>
#include <asm/sections.h>
+DEFINE_TRACE(kernel_kernel_kexec);
+DEFINE_TRACE(kernel_crash_kexec);
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -1066,6 +1070,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
void crash_kexec(struct pt_regs *regs)
{
+ trace_kernel_crash_kexec(kexec_crash_image, regs);
+
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
@@ -1495,6 +1501,8 @@ int kernel_kexec(void)
{
int error = 0;
+ trace_kernel_kernel_kexec(kexec_image);
+
if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
if (!kexec_image) {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f..e0841c537db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,6 +49,8 @@
#include "lockdep_internals.h"
+#include <trace/lockdep.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -66,6 +68,13 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+DEFINE_TRACE(lockdep_hardirqs_on);
+DEFINE_TRACE(lockdep_hardirqs_off);
+DEFINE_TRACE(lockdep_softirqs_on);
+DEFINE_TRACE(lockdep_softirqs_off);
+DEFINE_TRACE(lockdep_lock_acquire);
+DEFINE_TRACE(lockdep_lock_release);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -2300,6 +2309,8 @@ void trace_hardirqs_on_caller(unsigned long ip)
time_hardirqs_on(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_on(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2358,6 +2369,8 @@ void trace_hardirqs_off_caller(unsigned long ip)
time_hardirqs_off(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_off(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2390,6 +2403,8 @@ void trace_softirqs_on(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_on(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2424,6 +2439,8 @@ void trace_softirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_off(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2730,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
+ trace_lockdep_lock_acquire(ip, subclass, lock, trylock, read,
+ hardirqs_off);
+
if (!prove_locking)
check = 1;
@@ -3108,6 +3128,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_lock_release(ip, lock, nested);
+
if (!check_unlock(curr, lock, ip))
return;
diff --git a/kernel/ltt-channels.c b/kernel/ltt-channels.c
new file mode 100644
index 00000000000..102513874ad
--- /dev/null
+++ b/kernel/ltt-channels.c
@@ -0,0 +1,388 @@
+/*
+ * ltt/ltt-channels.c
+ *
+ * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng channel management.
+ *
+ * Author:
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/ltt-channels.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * ltt_channel_mutex may be nested inside the LTT trace mutex.
+ * ltt_channel_mutex may be nested inside the markers mutex.
+ */
+static DEFINE_MUTEX(ltt_channel_mutex);
+static LIST_HEAD(ltt_channels);
+/*
+ * Index of next channel in array. Makes sure that as long as a trace channel is
+ * allocated, no array index will be re-used when a channel is freed and then
+ * another channel is allocated. This index is cleared and the array indexeds
+ * get reassigned when the index_kref goes back to 0, which indicates that no
+ * more trace channels are allocated.
+ */
+static unsigned int free_index;
+/* index_kref is protected by both ltt_channel_mutex and lock_markers */
+static struct kref index_kref; /* Keeps track of allocated trace channels */
+
+static struct ltt_channel_setting *lookup_channel(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (strcmp(name, iter->name) == 0)
+ return iter;
+ return NULL;
+}
+
+/*
+ * Must be called when channel refcount falls to 0 _and_ also when the last
+ * trace is freed. This function is responsible for compacting the channel and
+ * event IDs when no users are active.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_channel_setting(struct kref *kref)
+{
+ struct ltt_channel_setting *setting = container_of(kref,
+ struct ltt_channel_setting, kref);
+ struct ltt_channel_setting *iter;
+
+ if (atomic_read(&index_kref.refcount) == 0
+ && atomic_read(&setting->kref.refcount) == 0) {
+ list_del(&setting->list);
+ kfree(setting);
+
+ free_index = 0;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ iter->index = free_index++;
+ iter->free_event_id = 0;
+ }
+ }
+}
+
+/*
+ * Perform channel index compaction when the last trace channel is freed.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_trace_channel(struct kref *kref)
+{
+ struct ltt_channel_setting *iter, *n;
+
+ list_for_each_entry_safe(iter, n, &ltt_channels, list)
+ release_channel_setting(&iter->kref);
+ if (atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+}
+
+/*
+ * ltt_channels_trace_ref - Is there an existing trace session?
+ *
+ * Must be called with lock_markers() held.
+ */
+int ltt_channels_trace_ref(void)
+{
+ return !!atomic_read(&index_kref.refcount);
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_ref);
+
+/**
+ * ltt_channels_register - Register a trace channel.
+ * @name: channel name
+ *
+ * Uses refcounting.
+ */
+int ltt_channels_register(const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (setting) {
+ if (atomic_read(&setting->kref.refcount) == 0)
+ goto init_kref;
+ else {
+ kref_get(&setting->kref);
+ goto end;
+ }
+ }
+ setting = kzalloc(sizeof(*setting), GFP_KERNEL);
+ if (!setting) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ list_add(&setting->list, &ltt_channels);
+ strncpy(setting->name, name, PATH_MAX-1);
+ setting->index = free_index++;
+init_kref:
+ kref_init(&setting->kref);
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_register);
+
+/**
+ * ltt_channels_unregister - Unregister a trace channel.
+ * @name: channel name
+ * @compacting: performing compaction
+ *
+ * Must be called with markers mutex held.
+ */
+int ltt_channels_unregister(const char *name, int compacting)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ if (!compacting)
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ kref_put(&setting->kref, release_channel_setting);
+ if (!compacting && atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+end:
+ if (!compacting)
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_unregister);
+
+/**
+ * ltt_channels_set_default - Set channel default behavior.
+ * @name: default channel name
+ * @sb_size: size of the subbuffers
+ * @n_sb: number of subbuffers
+ */
+int ltt_channels_set_default(const char *name,
+ unsigned int sb_size,
+ unsigned int n_sb)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ setting->sb_size = sb_size;
+ setting->n_sb = n_sb;
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_set_default);
+
+/**
+ * ltt_channels_get_name_from_index - get channel name from channel index
+ * @index: channel index
+ *
+ * Allows looking up the channel name given its index. Done to keep the name
+ * information outside of each trace channel instance.
+ */
+const char *ltt_channels_get_name_from_index(unsigned int index)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (iter->index == index && atomic_read(&iter->kref.refcount))
+ return iter->name;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index);
+
+static struct ltt_channel_setting *
+ltt_channels_get_setting_from_name(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (!strcmp(iter->name, name)
+ && atomic_read(&iter->kref.refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * ltt_channels_get_index_from_name - get channel index from channel name
+ * @name: channel name
+ *
+ * Allows looking up the channel index given its name. Done to keep the name
+ * information outside of each trace channel instance.
+ * Returns -1 if not found.
+ */
+int ltt_channels_get_index_from_name(const char *name)
+{
+ struct ltt_channel_setting *setting;
+
+ setting = ltt_channels_get_setting_from_name(name);
+ if (setting)
+ return setting->index;
+ else
+ return -1;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name);
+
+/**
+ * ltt_channels_trace_alloc - Allocate channel structures for a trace
+ * @nr_channels: returns the number of channels allocated
+ * @overwrite: overwrite flag to set on each channel
+ * @active: active flag to set on each channel
+ *
+ * Use the current channel list to allocate the channels for a trace.
+ * Called with trace lock held. Does not perform the trace buffer allocation,
+ * because we must let the user overwrite specific channel sizes.
+ */
+struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels,
+ int overwrite, int active)
+{
+ struct ltt_chan *chan = NULL;
+ struct ltt_channel_setting *iter;
+
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ if (!free_index)
+ goto end;
+ if (!atomic_read(&index_kref.refcount))
+ kref_init(&index_kref);
+ else
+ kref_get(&index_kref);
+ *nr_channels = free_index;
+ chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL);
+ if (!chan)
+ goto end;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ if (!atomic_read(&iter->kref.refcount))
+ continue;
+ chan[iter->index].a.sb_size = iter->sb_size;
+ chan[iter->index].a.n_sb = iter->n_sb;
+ chan[iter->index].overwrite = overwrite;
+ chan[iter->index].active = active;
+ strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1);
+ chan[iter->index].switch_timer_interval = 0;
+ }
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ return chan;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc);
+
+/**
+ * ltt_channels_trace_free - Free one trace's channels
+ * @channels: channels to free
+ * @nr_channels: number of channels to free
+ *
+ * Called with trace lock held. The actual channel buffers must be freed before
+ * this function is called.
+ */
+void ltt_channels_trace_free(struct ltt_chan *channels,
+ unsigned int nr_channels)
+{
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ kfree(channels);
+ kref_put(&index_kref, release_trace_channel);
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ marker_update_probes();
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_free);
+
+/**
+ * ltt_channels_trace_set_timer - set switch timer
+ * @chan: channel
+ * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer.
+ */
+
+void ltt_channels_trace_set_timer(struct ltt_chan *chan,
+ unsigned long interval)
+{
+ chan->switch_timer_interval = interval;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer);
+
+/**
+ * _ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ * Must be called with channels mutex held.
+ */
+int _ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret;
+
+ setting = ltt_channels_get_setting_from_name(channel);
+ if (!setting) {
+ ret = -ENOENT;
+ goto end;
+ }
+ if (strcmp(channel, "metadata") == 0) {
+ if (strcmp(name, "core_marker_id") == 0)
+ ret = 0;
+ else if (strcmp(name, "core_marker_format") == 0)
+ ret = 1;
+ else
+ ret = -ENOENT;
+ goto end;
+ }
+ if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) {
+ ret = -ENOSPC;
+ goto end;
+ }
+ ret = setting->free_event_id++;
+end:
+ return ret;
+}
+
+/**
+ * ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ */
+int ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ int ret;
+
+ mutex_lock(&ltt_channel_mutex);
+ ret = _ltt_channels_get_event_id(channel, name);
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+
+/**
+ * _ltt_channels_reset_event_ids - reset event IDs at compaction
+ *
+ * Called with lock_markers() and the channel mutex held.
+ */
+void _ltt_channels_reset_event_ids(void)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ iter->free_event_id = 0;
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management");
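Before moving on to marker.c, which is the main consumer of this interface, here is a hedged usage sketch of the channel API defined above. The channel name, the sub-buffer geometry and the error handling are illustrative assumptions; lock_markers()/unlock_markers() are the helpers exported by marker.c below.

#include <linux/ltt-channels.h>

extern void lock_markers(void);		/* exported by kernel/marker.c below */
extern void unlock_markers(void);

static int example_setup_channel(void)
{
	int index, ret;

	ret = ltt_channels_register("example");		/* refcounted */
	if (ret)
		return ret;

	/* Suggest a default geometry: 8 sub-buffers of 4 KiB each. */
	ret = ltt_channels_set_default("example", 4096, 8);
	if (ret)
		goto out_unregister;

	index = ltt_channels_get_index_from_name("example");
	if (index < 0) {
		ret = -ENOENT;
		goto out_unregister;
	}
	return 0;

out_unregister:
	lock_markers();		/* unregistration requires the markers mutex */
	ltt_channels_unregister("example", 0);
	unlock_markers();
	return ret;
}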
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 00000000000..eac8ebfc3b9
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,1262 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/immediate.h>
+#include <linux/ltt-channels.h>
+
+extern struct marker __start___markers[];
+extern struct marker __stop___markers[];
+
+/* Set to 1 to enable marker debug output */
+static const int marker_debug;
+
+/*
+ * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
+ * and module markers and the hash table.
+ * markers_mutex nests inside the trace lock, to ensure event ID consistency
+ * between the hash table and the marker section.
+ */
+static DEFINE_MUTEX(markers_mutex);
+
+void lock_markers(void)
+{
+ mutex_lock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_markers);
+
+void unlock_markers(void)
+{
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_markers);
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by markers_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+static struct hlist_head id_table[MARKER_TABLE_SIZE];
+
+struct marker_probe_array {
+ struct rcu_head rcu;
+ struct marker_probe_closure c[0];
+};
+
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * Marker entry modifications are protected by the markers_mutex.
+ */
+struct marker_entry {
+ struct hlist_node hlist;
+ struct hlist_node id_list;
+ char *format;
+ char *name;
+ /* Probe wrapper */
+ void (*call)(const struct marker *mdata, void *call_private, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_array *multi;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ u16 channel_id;
+ u16 event_id;
+ unsigned char ptype:1;
+ unsigned char format_allocated:1;
+ char channel[0]; /* Contains channel'\0'name'\0'format'\0' */
+};
+
+/**
+ * __mark_empty_function - Empty probe callback
+ * @mdata: marker data
+ * @probe_private: probe private data
+ * @call_private: call site private data
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we make sure the execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modify the execution flow of preemptible code.
+ */
+notrace void __mark_empty_function(const struct marker *mdata,
+ void *probe_private, void *call_private, const char *fmt, va_list *args)
+{
+}
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * marker_probe_cb - Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+notrace void marker_probe_cb(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args;
+ char ptype;
+
+ /*
+ * rcu_read_lock_sched does two things: it disables preemption to make
+ * sure the teardown of the callbacks can be done correctly when they
+ * are in modules, and it ensures RCU read coherency.
+ */
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, call_private);
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ va_end(args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++) {
+ va_start(args, call_private);
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ va_end(args);
+ }
+ }
+ rcu_read_unlock_sched_notrace();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb_noarg - Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++)
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ }
+ rcu_read_unlock_sched_notrace();
+}
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_probe_array *multi = container_of(head, struct marker_probe_array, rcu);
+ kfree(multi);
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.probe_private);
+ } else {
+ for (i = 0; entry->multi->c[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi->c[i].func,
+ entry->multi->c[i].probe_private);
+ }
+}
+
+static struct marker_probe_array *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_array *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++)
+ if (old->c[nr_probes].func == probe
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes + 2) * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new->c[0] = entry->single;
+ else
+ memcpy(&new->c[0], &old->c[0],
+ nr_probes * sizeof(struct marker_probe_closure));
+ new->c[nr_probes].func = probe;
+ new->c[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_array *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_array *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) {
+ if ((!probe || old->c[nr_probes].func == probe)
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ entry->single = old->c[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ new->c[j++] = old->c[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *get_marker(const char *channel, const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static struct marker_entry *add_marker(const char *channel, const char *name,
+ const char *format)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ size_t format_len = 0;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ if (format)
+ format_len = strlen(format) + 1;
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "Marker %s.%s busy\n", channel, name);
+ return ERR_PTR(-EBUSY); /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct marker_entry)
+ + channel_len + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+ memcpy(e->channel, channel, channel_len);
+ e->name = &e->channel[channel_len];
+ memcpy(e->name, name, name_len);
+ if (format) {
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ e->channel, e->name, e->format);
+ } else {
+ e->format = NULL;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
+ e->format_allocated = 0;
+ e->refcount = 0;
+ hlist_add_head(&e->hlist, head);
+ return e;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with markers_mutex
+ * held. Parameter "registered" indicates if the channel registration has been
+ * performed.
+ */
+static int remove_marker(const char *channel, const char *name, int registered,
+ int compacting)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ int found = 0;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+ int ret;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+
+ if (registered && ltt_channels_trace_ref())
+ return 0;
+
+ hlist_del(&e->hlist);
+ hlist_del(&e->id_list);
+ if (registered) {
+ ret = ltt_channels_unregister(e->channel, compacting);
+ WARN_ON(ret);
+ }
+ if (e->format_allocated)
+ kfree(e->format);
+ kfree(e);
+ return 0;
+}
+
+/*
+ * Set the mark_entry format to the format found in the element.
+ */
+static int marker_set_format(struct marker_entry *entry, const char *format)
+{
+ entry->format = kstrdup(format, GFP_KERNEL);
+ if (!entry->format)
+ return -ENOMEM;
+ entry->format_allocated = 1;
+
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ entry->channel, entry->name, entry->format);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one marker.
+ */
+static int set_marker(struct marker_entry *entry, struct marker *elem,
+ int active)
+{
+ int ret = 0;
+ WARN_ON(strcmp(entry->name, elem->name) != 0);
+
+ if (entry->format) {
+ if (strcmp(entry->format, elem->format) != 0) {
+ printk(KERN_NOTICE
+ "Format mismatch for probe %s "
+ "(%s), marker (%s)\n",
+ entry->name,
+ entry->format,
+ elem->format);
+ return -EPERM;
+ }
+ } else {
+ ret = marker_set_format(entry, elem->format);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = entry->call;
+ elem->channel_id = entry->channel_id;
+ elem->event_id = entry->event_id;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private != entry->single.probe_private
+ && !elem->ptype);
+ elem->single.probe_private = entry->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = entry->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, entry->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = entry->ptype;
+
+ if (elem->tp_name && (active ^ _imv_read(elem->state))) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+
+ if (active) {
+ /*
+ * try_module_get should always succeed because we hold
+ * lock_module() to get the tp_cb address.
+ */
+ ret = try_module_get(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ BUG_ON(!ret);
+ ret = tracepoint_probe_register_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ } else {
+ ret = tracepoint_probe_unregister_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ }
+ }
+ elem->state__imv = active;
+
+ return ret;
+}
+
+/*
+ * Disable a marker and its probe callback.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is ensured
+ * by rcu_read_lock_sched around the call site.
+ */
+static void disable_marker(struct marker *elem)
+{
+ int ret;
+
+ /* leave "call" as is. It is known statically. */
+ if (elem->tp_name && _imv_read(elem->state)) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+ ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+ elem->tp_cb, NULL);
+ WARN_ON(ret);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address((unsigned long)elem->tp_cb));
+ }
+ elem->state__imv = 0;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
+ /*
+ * Leave the private data and channel_id/event_id there, because removal
+ * is racy and should be done only after an RCU period. These are never
+ * used until the next initialization anyway.
+ */
+}
+
+/*
+ * is_marker_present - Check if a marker is present in kernel.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * We cannot take the marker lock around calls to this function because it needs
+ * to take the module mutex within the iterator. Marker mutex nests inside
+ * module mutex.
+ * Returns 1 if the marker is present, 0 if not.
+ */
+int is_marker_present(const char *channel, const char *name)
+{
+ int ret;
+ struct marker_iter iter;
+
+ ret = 0;
+
+ marker_iter_reset(&iter);
+ marker_iter_start(&iter);
+ for (; iter.marker != NULL; marker_iter_next(&iter)) {
+ if (!strcmp(iter.marker->channel, channel) &&
+ !strcmp(iter.marker->name, name)) {
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ marker_iter_stop(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_present);
+
+/*
+ * _is_marker_enabled - Check if a marker is enabled, must be called with
+ * markers_mutex held.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int _is_marker_enabled(const char *channel, const char *name)
+{
+ struct marker_entry *entry;
+
+ entry = get_marker(channel, name);
+
+ return entry && !!entry->refcount;
+}
+EXPORT_SYMBOL_GPL(_is_marker_enabled);
+
+/*
+ * is_marker_enabled - the wrapper of _is_marker_enabled
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int is_marker_enabled(const char *channel, const char *name)
+{
+ int ret;
+
+ lock_markers();
+ ret = _is_marker_enabled(channel, name);
+ unlock_markers();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_enabled);
+
+/**
+ * marker_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of markers.
+ */
+void marker_update_probe_range(struct marker *begin,
+ struct marker *end)
+{
+ struct marker *iter;
+ struct marker_entry *mark_entry;
+
+ mutex_lock(&markers_mutex);
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_marker(iter->channel, iter->name);
+ if (mark_entry) {
+ set_marker(mark_entry, iter, !!mark_entry->refcount);
+ /*
+ * ignore error, continue
+ */
+ } else {
+ disable_marker(iter);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Side effect: marker_set_format may delete the marker entry (creating a
+ * replacement).
+ */
+void marker_update_probes(void)
+{
+ /* Core kernel markers */
+ marker_update_probe_range(__start___markers, __stop___markers);
+ /* Markers in modules. */
+ module_update_markers();
+ tracepoint_probe_update_all();
+ /* Update immediate values */
+ core_imv_update();
+ module_imv_update();
+}
+
+/**
+ * marker_probe_register - Connect a probe to a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @format: format string
+ * @probe: probe handler
+ * @probe_private: probe private data
+ *
+ * private data must be a valid allocated memory address, or NULL.
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int marker_probe_register(const char *channel, const char *name,
+ const char *format, marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0, ret_err;
+ struct marker_probe_array *old;
+ int first_probe = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ first_probe = 1;
+ entry = add_marker(channel, name, format);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ if (ret)
+ goto end;
+ ret = ltt_channels_register(channel);
+ if (ret)
+ goto error_remove_marker;
+ ret = ltt_channels_get_index_from_name(channel);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->channel_id = ret;
+ ret = ltt_channels_get_event_id(channel, name);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->event_id = ret;
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16) | entry->event_id,
+ MARKER_HASH_BITS));
+ ret = 0;
+ trace_mark(metadata, core_marker_id,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ channel, name, entry->event_id,
+ sizeof(int), sizeof(long), sizeof(void *),
+ sizeof(size_t), ltt_get_alignment());
+ } else if (format) {
+ if (!entry->format)
+ ret = marker_set_format(entry, format);
+ else if (strcmp(entry->format, format))
+ ret = -EPERM;
+ if (ret)
+ goto end;
+ }
+
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
+ if (first_probe)
+ goto error_unregister_channel;
+ else
+ goto end;
+ }
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+error_unregister_channel:
+ ret_err = ltt_channels_unregister(channel, 1);
+ WARN_ON(ret_err);
+error_remove_marker:
+ ret_err = remove_marker(channel, name, 0, 0);
+ WARN_ON(ret_err);
+end:
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* for compaction on error path */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_register);
+
+/**
+ * marker_probe_unregister - Disconnect a probe from a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
+ *
+ * Returns 0 on success, or an error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister(const char *channel, const char *name,
+ marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ struct marker_probe_array *old;
+ int ret = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ old = marker_entry_remove_probe(entry, probe, probe_private);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
+
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_array *closure;
+ unsigned int j;
+
+ closure = entry->multi;
+ /* Use a separate index; reusing 'i' would corrupt the outer table scan. */
+ for (j = 0; closure->c[j].func; j++) {
+ if (closure->c[j].func == probe &&
+ closure->c[j].probe_private
+ == probe_private)
+ return entry;
+ }
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
+ *
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0;
+ struct marker_probe_array *old;
+ const char *channel = NULL, *name = NULL;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker_from_private_data(probe, probe_private);
+ if (!entry) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
+ channel = kstrdup(entry->channel, GFP_KERNEL);
+ name = kstrdup(entry->name, GFP_KERNEL);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ goto end;
+
+unlock:
+ mutex_unlock(&markers_mutex);
+end:
+ kfree(channel);
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
+
+/**
+ * marker_get_private_data - Get a marker's probe private data
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
+ *
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR().
+ * The private data pointer should _only_ be dereferenced if the caller is the
+ * owner of the data, or its content could vanish. This is mostly used to
+ * confirm that a caller is the owner of a registered probe.
+ */
+void *marker_get_private_data(const char *channel, const char *name,
+ marker_probe_func *probe, int num)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ int i;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ } else {
+ struct marker_probe_array *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure->c[i].func; i++) {
+ if (closure->c[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure->c[i].probe_private;
+ }
+ }
+ break;
+ }
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+static struct marker_entry *get_entry_from_id(u16 channel_id, u16 event_id)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e, *found = NULL;
+ u32 hash = hash_32((channel_id << 16) | event_id, MARKER_HASH_BITS);
+
+ mutex_lock(&markers_mutex);
+ head = id_table + hash;
+ hlist_for_each_entry(e, node, head, id_list) {
+ if (e->channel_id == channel_id && e->event_id == event_id) {
+ found = e;
+ break;
+ }
+ }
+ mutex_unlock(&markers_mutex);
+ return found;
+}
+
+/* Must be called while the IDs and the marker_entry are kept alive. */
+const char *marker_get_name_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->name : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_name_from_id);
+
+const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->format : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_fmt_from_id);
+
+/**
+ * markers_compact_event_ids - Compact marker event IDs and reassign channels
+ *
+ * Called by the channel infrastructure when no channel users are active.
+ * Called with the trace lock, lock_markers() and the channel mutex held.
+ *
+ * marker_update_probes() must be executed after compaction, before releasing
+ * the trace lock.
+ */
+void markers_compact_event_ids(void)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node, *next;
+ int ret;
+
+ _ltt_channels_reset_event_ids();
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry_safe(entry, node, next, head, hlist) {
+ if (!entry->refcount) {
+ remove_marker(entry->channel, entry->name,
+ 1, 1);
+ continue;
+ }
+ ret = ltt_channels_get_index_from_name(entry->channel);
+ WARN_ON(ret < 0);
+ entry->channel_id = ret;
+ ret = _ltt_channels_get_event_id(entry->channel,
+ entry->name);
+ WARN_ON(ret < 0);
+ entry->event_id = ret;
+ }
+ }
+
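+	/*
+	 * Rebuild the event ID hash table so that ID-based lookups see the
+	 * newly compacted channel and event IDs.
+	 */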
+ memset(id_table, 0, sizeof(id_table));
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16)
+ | entry->event_id, MARKER_HASH_BITS));
+ }
+ }
+}
+
+#ifdef CONFIG_MODULES
+
+/**
+ * marker_get_iter_range - Get the next marker given a range.
+ * @marker: current marker (in), next marker (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next marker has been found (1) or not (0).
+ * Will return the first marker in the range if the input marker is NULL.
+ */
+int marker_get_iter_range(struct marker **marker, struct marker *begin,
+ struct marker *end)
+{
+ if (!*marker && begin != end) {
+ *marker = begin;
+ return 1;
+ }
+ if (*marker >= begin && *marker < end)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(marker_get_iter_range);
+
+static void marker_get_iter(struct marker_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel markers */
+ if (!iter->module) {
+ found = marker_get_iter_range(&iter->marker,
+ __start___markers, __stop___markers);
+ if (found)
+ goto end;
+ }
+ /* Markers in modules. */
+ found = module_get_iter_markers(iter);
+end:
+ if (!found)
+ marker_iter_reset(iter);
+}
+
+void marker_iter_start(struct marker_iter *iter)
+{
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_start);
+
+void marker_iter_next(struct marker_iter *iter)
+{
+ iter->marker++;
+ /*
+ * iter->marker may be invalid because we blindly incremented it.
+	 * Make sure it is valid by checking it against the marker ranges,
+	 * moving on to the following modules if necessary.
+ */
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_next);
+
+void marker_iter_stop(struct marker_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(marker_iter_stop);
+
+void marker_iter_reset(struct marker_iter *iter)
+{
+ iter->module = NULL;
+ iter->marker = NULL;
+}
+EXPORT_SYMBOL_GPL(marker_iter_reset);
+
+int marker_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ case MODULE_STATE_GOING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ }
+ return 0;
+}
+
+struct notifier_block marker_module_nb = {
+ .notifier_call = marker_module_notify,
+ .priority = 0,
+};
+
+static int init_markers(void)
+{
+ return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
+
+void ltt_dump_marker_state(struct ltt_trace *trace)
+{
+ struct marker_entry *entry;
+ struct ltt_probe_private_data call_data;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ unsigned int i;
+
+ mutex_lock(&markers_mutex);
+ call_data.trace = trace;
+ call_data.serializer = NULL;
+
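+	/*
+	 * Emit one core_marker_id event per registered marker (plus a
+	 * core_marker_format event when a format string is known) into the
+	 * metadata channel.
+	 */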
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ __trace_mark(0, metadata, core_marker_id,
+ &call_data,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ entry->channel,
+ entry->name,
+ entry->event_id,
+ sizeof(int), sizeof(long),
+ sizeof(void *), sizeof(size_t),
+ ltt_get_alignment());
+ if (entry->format)
+ __trace_mark(0, metadata,
+ core_marker_format,
+ &call_data,
+ "channel %s name %s format %s",
+ entry->channel,
+ entry->name,
+ entry->format);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_dump_marker_state);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94b..2767c8eaf12 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
+#include <trace/kernel.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -99,7 +100,9 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete uses stop_machine/add uses RCU list operations).
+ * Sorted by ascending list node address.
+ */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -120,6 +123,9 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+DEFINE_TRACE(kernel_module_load);
+DEFINE_TRACE(kernel_module_free);
+
int register_module_notifier(struct notifier_block * nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -1675,6 +1681,7 @@ static inline void unset_section_ro_nx(struct module *mod, void *module_region)
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
+ trace_kernel_module_free(mod);
trace_module_free(mod);
/* Delete from various lists */
@@ -2272,6 +2279,12 @@ static int copy_and_check(struct load_info *info,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return -ENOMEM;
+ /*
+	 * Make sure accesses to the module text or data never generate any
+	 * page fault.
+ */
+ vmalloc_sync_all();
+
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@@ -2459,6 +2472,10 @@ static void find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->ctors), &mod->num_ctors);
#endif
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(info, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
sizeof(*mod->tracepoints_ptrs),
@@ -2717,7 +2734,7 @@ static struct module *load_module(void __user *umod,
const char __user *uargs)
{
struct load_info info = { NULL, };
- struct module *mod;
+ struct module *mod, *iter;
long err;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2799,7 +2816,23 @@ static struct module *load_module(void __user *umod,
goto ddebug;
module_bug_finalize(info.hdr, info.sechdrs, mod);
+ /*
+	 * We sort the modules by struct module pointer address to permit
+	 * correct iteration over modules by preemptible operations such as
+	 * the kallsyms read() path. Sorting by struct module pointer address
+	 * is equivalent to sorting by list node address.
+ */
+ list_for_each_entry_reverse(iter, &modules, list) {
+ BUG_ON(iter == mod); /* Should never be in the list twice */
+ if (iter < mod) {
+ /* We belong to the location right after iter. */
+ list_add_rcu(&mod->list, &iter->list);
+ goto module_added;
+ }
+ }
+ /* We should be added at the head of the list */
list_add_rcu(&mod->list, &modules);
+module_added:
mutex_unlock(&module_mutex);
/* Module is ready to execute: parsing args may do that. */
@@ -2817,6 +2850,7 @@ static struct module *load_module(void __user *umod,
free_copy(&info);
/* Done! */
+ trace_kernel_module_load(mod);
trace_module_load(mod);
return mod;
@@ -3196,12 +3230,12 @@ static char *module_flags(struct module *mod, char *buf)
static void *m_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
+ return seq_sorted_list_start(&modules, pos);
}
static void *m_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &modules, pos);
+ return seq_sorted_list_next(p, &modules, pos);
}
static void m_stop(struct seq_file *m, void *p)
@@ -3266,6 +3300,27 @@ static int __init proc_modules_init(void)
module_init(proc_modules_init);
#endif
+void list_modules(void *call_data)
+{
+ /* Enumerate loaded modules */
+ struct list_head *i;
+ struct module *mod;
+ unsigned long refcount = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each(i, &modules) {
+ mod = list_entry(i, struct module, list);
+#ifdef CONFIG_MODULE_UNLOAD
+ refcount = module_refcount(mod);
+#endif
+ __trace_mark(0, module_state, list_module, call_data,
+ "name %s state %d refcount %lu",
+ mod->name, mod->state, refcount);
+ }
+ mutex_unlock(&module_mutex);
+}
+EXPORT_SYMBOL_GPL(list_modules);
+
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -3393,12 +3448,59 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
+ struct marker *marker,
struct tracepoint * const *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
+#ifdef CONFIG_MARKERS
+void module_update_markers(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!(mod->taints & TAINT_FORCED_MODULE))
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ mutex_unlock(&module_mutex);
+}
+
+/*
+ * Returns 1 if the current iterator position was found, 0 otherwise.
+ */
+int module_get_iter_markers(struct marker_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!(iter_mod->taints & TAINT_FORCED_MODULE)) {
+			/*
+			 * The module list is sorted by address: skip modules
+			 * already iterated over, and reset the marker cursor
+			 * when moving on to a new module.
+			 */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->marker = NULL;
+ found = marker_get_iter_range(&iter->marker,
+ iter_mod->markers,
+ iter_mod->markers + iter_mod->num_markers);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+#endif
+
#ifdef CONFIG_TRACEPOINTS
void module_update_tracepoints(void)
{
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb56..e8481427153 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,6 +5,7 @@
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#include <linux/idle.h>
/*
* Notifier list for kernel code which wants to be called
@@ -148,7 +149,7 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
- synchronize_rcu();
+ synchronize_sched();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
@@ -178,9 +179,9 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
{
int ret;
- rcu_read_lock();
+ rcu_read_lock_sched_notrace();
ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
- rcu_read_unlock();
+ rcu_read_unlock_sched_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
@@ -584,3 +585,27 @@ int unregister_die_notifier(struct notifier_block *nb)
return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+/*
+ * Trace last event before calling notifiers. Notifiers flush data from buffers
+ * before going to idle.
+ */
+int notrace notify_idle(enum idle_val val)
+{
+ return atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(notify_idle);
+
+void register_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(register_idle_notifier);
+
+void unregister_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(unregister_idle_notifier);
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a170..3fd05f5708c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,9 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
+#include <trace/kernel.h>
+
+DEFINE_TRACE(kernel_panic);
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -64,6 +67,10 @@ NORET_TYPE void panic(const char * fmt, ...)
long i, i_next = 0;
int state = 0;
+ va_start(args, fmt);
+ trace_kernel_panic(fmt, args);
+ va_end(args);
+
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22..99373ab79a4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -40,6 +40,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/rculist.h>
+#include <trace/kernel.h>
#include <asm/uaccess.h>
@@ -67,6 +68,7 @@ int console_printk[4] = {
MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
/*
* Low level drivers may need that to know if they can schedule in
@@ -136,6 +138,9 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+DEFINE_TRACE(kernel_printk);
+DEFINE_TRACE(kernel_vprintk);
+
#ifdef CONFIG_PRINTK
static char __log_buf[__LOG_BUF_LEN];
@@ -650,6 +655,7 @@ asmlinkage int printk(const char *fmt, ...)
}
#endif
va_start(args, fmt);
+ trace_kernel_printk(_RET_IP_);
r = vprintk(fmt, args);
va_end(args);
@@ -773,6 +779,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
+ trace_kernel_vprintk(_RET_IP_, printk_buf, printed_len);
p = printk_buf;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8..a86e46b6bc1 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,7 @@
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
+#include <trace/rcu.h>
#include "rcutree.h"
@@ -145,6 +146,10 @@ int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
module_param(rcu_cpu_stall_suppress, int, 0644);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+DEFINE_TRACE(rcu_tree_call_rcu);
+DEFINE_TRACE(rcu_tree_call_rcu_bh);
+DEFINE_TRACE(rcu_tree_callback);
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -1143,6 +1148,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ trace_rcu_tree_callback(list);
list->func(list);
list = next;
if (++count >= rdp->blimit)
@@ -1488,6 +1494,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
*/
void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
+ trace_rcu_tree_call_rcu_bh(head, _RET_IP_);
__call_rcu(head, func, &rcu_bh_state);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
diff --git a/kernel/sched.c b/kernel/sched.c
index c164920c8ce..936fd7bc449 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9255,3 +9255,57 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
+static DEFINE_MUTEX(kernel_trace_mutex);
+static int kernel_trace_refcount;
+
+/**
+ * clear_kernel_trace_flag_all_tasks - clears all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates on all threads in the system to clear their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish clearing
+ * the thread flags, all threads have their flags cleared.
+ */
+void clear_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (--kernel_trace_refcount)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ clear_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(clear_kernel_trace_flag_all_tasks);
+
+/**
+ * set_kernel_trace_flag_all_tasks - sets all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates on all threads in the system to set their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish setting
+ * the thread flags, all threads have their flags set.
+ */
+void set_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (kernel_trace_refcount++)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(set_kernel_trace_flag_all_tasks);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec38..a25bf611d13 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,7 +23,10 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/marker.h>
+#include <linux/kallsyms.h>
#include <linux/tick.h>
+#include <trace/irq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -54,6 +57,20 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
+void ltt_dump_softirq_vec(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
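+	/* Record the address and symbol name of each softirq handler. */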
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+ sprint_symbol(namebuf, (unsigned long)softirq_vec[i].action);
+ __trace_mark(0, softirq_state, softirq_vec, call_data,
+ "id %d address %p symbol %s",
+ i, softirq_vec[i].action, namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_softirq_vec);
+
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
@@ -61,6 +78,11 @@ char *softirq_to_name[NR_SOFTIRQS] = {
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
+DEFINE_TRACE(irq_tasklet_high_entry);
+DEFINE_TRACE(irq_tasklet_high_exit);
+DEFINE_TRACE(irq_tasklet_low_entry);
+DEFINE_TRACE(irq_tasklet_low_exit);
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -341,6 +363,7 @@ void irq_exit(void)
*/
inline void raise_softirq_irqoff(unsigned int nr)
{
+ trace_softirq_raise(nr);
__raise_softirq_irqoff(nr);
/*
@@ -440,7 +463,9 @@ static void tasklet_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_low_entry(t);
t->func(t->data);
+ trace_irq_tasklet_low_exit(t);
tasklet_unlock(t);
continue;
}
@@ -475,7 +500,9 @@ static void tasklet_hi_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_high_entry(t);
t->func(t->data);
+ trace_irq_tasklet_high_exit(t);
tasklet_unlock(t);
continue;
}
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06..dbaa0648631 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
+obj-$(CONFIG_HAVE_UNSYNCHRONIZED_TSC) += tsc-sync.o
diff --git a/kernel/time/tsc-sync.c b/kernel/time/tsc-sync.c
new file mode 100644
index 00000000000..2ac1544ee22
--- /dev/null
+++ b/kernel/time/tsc-sync.c
@@ -0,0 +1,313 @@
+/*
+ * kernel/time/tsc-sync.c
+ *
+ * Test TSC synchronization
+ *
+ * Marks the TSC as unstable _and_ keeps a simple "_tsc_is_sync" variable,
+ * which is fast to read when a simple test must determine which clock source
+ * to use for kernel tracing.
+ *
+ * - CPU init :
+ *
+ * We check whether all boot CPUs have their TSCs synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * (The serial nature of the boot logic and the CPU hotplug lock
+ * protect against more than 2 CPUs entering this code.)
+ *
+ * - When CPUs are up :
+ *
+ * TSC synchronicity of all CPUs can be checked later at run-time by calling
+ * test_tsc_synchronization().
+ *
+ * Copyright 2007, 2008
+ * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ */
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/trace-clock.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#define MAX_CYCLES_DELTA 3000ULL
+
+/*
+ * Number of loops to take care of MCE, NMIs, SMIs.
+ */
+#define NR_LOOPS 200
+
+static DEFINE_MUTEX(tscsync_mutex);
+
+struct sync_data {
+ int nr_waits;
+ int wait_sync;
+ cycles_t tsc_count;
+} ____cacheline_aligned;
+
+/* 0 is master, 1 is slave */
+static struct sync_data sync_data[2] = {
+ [0 ... 1] = {
+ .nr_waits = 3 * NR_LOOPS + 1,
+ .wait_sync = 3 * NR_LOOPS + 1,
+ },
+};
+
+int _tsc_is_sync = 1;
+EXPORT_SYMBOL(_tsc_is_sync);
+
+static int force_tsc_sync;
+static cycles_t slave_offset;
+static int slave_offset_ready; /* for 32-bits architectures */
+
+static int __init force_tsc_sync_setup(char *str)
+{
+ force_tsc_sync = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("force_tsc_sync=", force_tsc_sync_setup);
+
+/*
+ * Mark it noinline to make sure it is not inlined into its callers.
+ * Waits until the synchronization value is reached.
+ */
+static noinline void tsc_barrier(long this_cpu)
+{
+ sync_core();
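+	/*
+	 * Announce that this CPU reached the barrier, then spin until the
+	 * other CPU catches up before sampling the cycle counter.
+	 */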
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+				 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ get_cycles_barrier();
+ sync_data[this_cpu].tsc_count = get_cycles();
+ get_cycles_barrier();
+}
+
+/*
+ * Worker thread called on each CPU.
+ * First wait with interrupts enabled, then wait with interrupt disabled,
+ * for precision. We are already bound to one CPU.
+ * this_cpu 0 : master
+ * this_cpu 1 : slave
+ */
+static void test_sync(void *arg)
+{
+ long this_cpu = (long)arg;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Make sure the instructions are in I-CACHE */
+ tsc_barrier(this_cpu);
+ tsc_barrier(this_cpu);
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+				 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ /*
+ * Here, only the master will wait for the slave to reach this barrier.
+ * This makes sure that the master, which holds the mutex and will reset
+ * the barriers, waits for the slave to stop using the barrier values
+ * before it continues. This is only done at the complete end of all the
+	 * loops. This is why there is a + 1 in the original wait_sync value.
+ */
+ if (sync_data[this_cpu].nr_waits == 1)
+ sync_data[this_cpu].wait_sync--;
+ local_irq_restore(flags);
+}
+
+/*
+ * Each CPU (master and target) must decrement the wait_sync value twice (once
+ * for priming the cache), and once more after the get_cycles() read. After all
+ * the loops, one last synchronization is required to make sure the master
+ * waits for the slave before resetting the barriers.
+ */
+static void reset_barriers(void)
+{
+ int i;
+
+ /*
+	 * Wait until the slave is done so that we don't overwrite
+	 * wait_sync prematurely.
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1].wait_sync >= sync_data[0].nr_waits))
+ barrier(); /*
+			 * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(sync_data[i].wait_sync != 0);
+ WARN_ON(sync_data[i].nr_waits != 1);
+ sync_data[i].wait_sync = 3 * NR_LOOPS + 1;
+ sync_data[i].nr_waits = 3 * NR_LOOPS + 1;
+ }
+}
+
+/*
+ * Run several loops (making sure no unexpected event changes the timing) and
+ * keep the best one. The result of each loop is the highest TSC delta between
+ * the master CPU and the slaves. CPU hotplug is disabled while this code runs
+ * so that we remain concurrency-safe wrt the CPU hotplug paths that also use
+ * this code. We test TSC synchronization even if we already "know" the CPUs
+ * were not synchronized. This
+ * can be used as a test to check if, for some reason, the CPUs eventually got
+ * in sync after a CPU has been unplugged. This code is kept separate from the
+ * CPU hotplug code because the slave CPU executes in an IPI, which we want to
+ * keep as short as possible (this is happening while the system is running).
+ * Therefore, we do not send a single IPI for all the test loops, but rather
+ * send one IPI per loop.
+ */
+int test_tsc_synchronization(void)
+{
+ long cpu, master;
+ cycles_t max_diff = 0, diff, best_loop, worse_loop = 0;
+ int i;
+
+ mutex_lock(&tscsync_mutex);
+ get_online_cpus();
+
+ printk(KERN_INFO
+ "checking TSC synchronization across all online CPUs:");
+
+ preempt_disable();
+ master = smp_processor_id();
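+	/*
+	 * For each slave CPU, run NR_LOOPS measurement rounds and keep the
+	 * smallest delta; the worst per-CPU best delta decides whether the
+	 * TSCs are considered synchronized.
+	 */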
+ for_each_online_cpu(cpu) {
+ if (master == cpu)
+ continue;
+ best_loop = (cycles_t)ULLONG_MAX;
+ for (i = 0; i < NR_LOOPS; i++) {
+ smp_call_function_single(cpu, test_sync,
+ (void *)1UL, 0);
+ test_sync((void *)0UL);
+ diff = abs(sync_data[1].tsc_count
+ - sync_data[0].tsc_count);
+ best_loop = min(best_loop, diff);
+ worse_loop = max(worse_loop, diff);
+ }
+ reset_barriers();
+ max_diff = max(best_loop, max_diff);
+ }
+ preempt_enable();
+ if (max_diff >= MAX_CYCLES_DELTA) {
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)max_diff);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ } else {
+ printk(" passed.\n");
+ }
+ put_online_cpus();
+ mutex_unlock(&tscsync_mutex);
+ return max_diff < MAX_CYCLES_DELTA;
+}
+EXPORT_SYMBOL_GPL(test_tsc_synchronization);
+
+/*
+ * Test synchronicity of a single core when it is hotplugged.
+ * Source CPU calls into this - waits for the freshly booted target CPU to
+ * arrive and then start the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+ cycles_t diff, abs_diff,
+ best_loop = (cycles_t)ULLONG_MAX, worse_loop = 0;
+ int i;
+
+ /*
+ * No need to check if we already know that the TSC is not synchronized:
+ */
+ if (!force_tsc_sync && unsynchronized_tsc()) {
+ /*
+		 * Make sure we set _tsc_is_sync to 0 if the TSC is found
+		 * to be unsynchronized for causes other than non-synchronized
+		 * TSCs across CPUs.
+ */
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ return;
+ }
+
+ printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
+
+ for (i = 0; i < NR_LOOPS; i++) {
+ test_sync((void *)0UL);
+ diff = sync_data[1].tsc_count - sync_data[0].tsc_count;
+ abs_diff = abs(diff);
+ best_loop = min(best_loop, abs_diff);
+ worse_loop = max(worse_loop, abs_diff);
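+		/*
+		 * Remember the signed offset seen during the best (smallest
+		 * absolute delta) loop; the slave subtracts it from its TSC
+		 * when synchronization is forced.
+		 */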
+ if (force_tsc_sync && best_loop == abs_diff)
+ slave_offset = diff;
+ }
+ reset_barriers();
+
+ if (!force_tsc_sync && best_loop >= MAX_CYCLES_DELTA) {
+ printk(" failed.\n");
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)best_loop);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ } else {
+ printk(" %s.\n", !force_tsc_sync ? "passed" : "forced");
+ }
+ if (force_tsc_sync) {
+ /* order slave_offset and slave_offset_ready writes */
+ smp_wmb();
+ slave_offset_ready = 1;
+ }
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+ int i;
+
+ if (!force_tsc_sync && unsynchronized_tsc())
+ return;
+
+ for (i = 0; i < NR_LOOPS; i++)
+ test_sync((void *)1UL);
+
+ /*
+ * Force slave synchronization if requested.
+ */
+ if (force_tsc_sync) {
+ unsigned long flags;
+ cycles_t new_tsc;
+
+ while (!slave_offset_ready)
+ cpu_relax();
+ /* order slave_offset and slave_offset_ready reads */
+ smp_rmb();
+ local_irq_save(flags);
+ /*
+ * slave_offset is read when master has finished writing to it,
+ * and is protected by cpu hotplug serialization.
+ */
+ new_tsc = get_cycles() - slave_offset;
+ write_tsc((u32)new_tsc, (u32)((u64)new_tsc >> 32));
+ local_irq_restore(flags);
+ }
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d24..65cc58ce148 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -40,12 +40,14 @@
#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>
+#include <asm/irq_regs.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -54,6 +56,10 @@ u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
+DEFINE_TRACE(timer_set);
+DEFINE_TRACE(timer_update_time);
+DEFINE_TRACE(timer_timeout);
+
/*
* per-CPU timer vector definitions:
*/
@@ -366,6 +372,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
+ trace_timer_set(timer);
/*
* Timers are FIFO:
*/
@@ -1303,8 +1310,13 @@ void run_local_timers(void)
void do_timer(unsigned long ticks)
{
+ struct timespec curtime, wtom;
+
jiffies_64 += ticks;
update_wall_time();
+ curtime = __current_kernel_time();
+ wtom = __get_wall_to_monotonic();
+ trace_timer_update_time(&curtime, &wtom);
calc_global_load(ticks);
}
@@ -1387,7 +1399,9 @@ SYSCALL_DEFINE0(getegid)
static void process_timeout(unsigned long __data)
{
- wake_up_process((struct task_struct *)__data);
+ struct task_struct *task = (struct task_struct *)__data;
+ trace_timer_timeout(task);
+ wake_up_process(task);
}
/**
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..614d9153a24 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace-clock-32-to-64.o
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace-clock.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/trace-clock-32-to-64.c b/kernel/trace/trace-clock-32-to-64.c
new file mode 100644
index 00000000000..c036f5c5586
--- /dev/null
+++ b/kernel/trace/trace-clock-32-to-64.c
@@ -0,0 +1,296 @@
+/*
+ * kernel/trace/trace-clock-32-to-64.c
+ *
+ * (C) Copyright 2006,2007,2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Extends a 32-bit clock source to a full 64-bit count, readable atomically
+ * from any execution context.
+ *
+ * Notes :
+ * - The 32->64 bits extended, timer-based trace clock cannot be used for early
+ *   tracing in the boot process, as it depends on timer interrupts.
+ * - The timer is only on one CPU to support hotplug.
+ * - We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* needed due to include order problem on m68k */
+#include <linux/math64.h>
+
+#define HW_BITMASK ((1ULL << TC_HW_BITS) - 1)
+#define HW_LS32(hw) ((hw) & HW_BITMASK)
+#define SW_MS32(sw) ((sw) & ~HW_BITMASK)
+
+static DEFINE_SPINLOCK(synthetic_tsc_lock);
+static int synthetic_tsc_refcount; /* Number of readers */
+static int synthetic_tsc_enabled; /* synth. TSC enabled on all online CPUs */
+
+static DEFINE_PER_CPU(struct timer_list, tsc_timer);
+static unsigned int precalc_expire;
+
+struct synthetic_tsc_struct {
+ union {
+ u64 val;
+ struct {
+#ifdef __BIG_ENDIAN
+ u32 ms32;
+ u32 ls32;
+#else
+ u32 ls32;
+ u32 ms32;
+#endif
+ } sel;
+ } tsc[2];
+ unsigned int index; /* Index of the current synth. tsc. */
+};
+
+static DEFINE_PER_CPU(struct synthetic_tsc_struct, synthetic_tsc);
+
+/* Called from IPI or timer interrupt */
+static void update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ if (tsc < HW_LS32(cpu_synth->tsc[cpu_synth->index].sel.ls32)) {
+ unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non atomic update of the non current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index].val =
+ (SW_MS32(cpu_synth->tsc[cpu_synth->index].val)
+ | (u64)tsc) + (1ULL << TC_HW_BITS);
+ /*
+ * Ensure the compiler does not reorder index write. It makes
+ * sure all nested interrupts will see the new value before the
+ * new index is written.
+ */
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+		 * No overflow: we know that the only bits changed are
+		 * contained in the 32 LSBs, which can be written atomically.
+ */
+ cpu_synth->tsc[cpu_synth->index].sel.ls32 =
+ SW_MS32(cpu_synth->tsc[cpu_synth->index].sel.ls32) | tsc;
+ }
+}
+
+/*
+ * Should only be called when interrupts are off. Affects only current CPU.
+ */
+void _trace_clock_write_synthetic_tsc(u64 value)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ unsigned int new_index;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ cpu_synth->tsc[new_index].val = value;
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 notrace trace_clock_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable_notrace();
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ index = ACCESS_ONCE(cpu_synth->index); /* atomic read */
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ /* Overflow detection */
+ if (unlikely(tsc < HW_LS32(cpu_synth->tsc[index].sel.ls32)))
+ ret = (SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc)
+ + (1ULL << TC_HW_BITS);
+ else
+ ret = SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_clock_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ update_synthetic_tsc();
+}
+
+/*
+ * tsc_timer_fct - Timer function updating the synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least one execution before the low word of the TSC wraps.
+ */
+static void tsc_timer_fct(unsigned long data)
+{
+ update_synthetic_tsc();
+
+ mod_timer_pinned(&per_cpu(tsc_timer, smp_processor_id()),
+ jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval - Precalculate the interval between the hardware
+ * clock wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ u64 rem_freq, rem_interval;
+
+ precalc_expire =
+ __iter_div_u64_rem(HW_BITMASK, (
+ __iter_div_u64_rem(trace_clock_frequency(),
+ HZ * trace_clock_freq_scale(), &rem_freq) << 1
+ )
+ - 1
+ - (TC_EXPECTED_INTERRUPT_LATENCY * HZ / 1000), &rem_interval)
+ >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+static void prepare_synthetic_tsc(int cpu)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ cpu_synth = &per_cpu(synthetic_tsc, cpu);
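+	/*
+	 * Seed the upcoming CPU's synthetic TSC from the current CPU's value
+	 * so the extended 64-bit count stays consistent across hotplug.
+	 */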
+ local_count = trace_clock_read_synthetic_tsc();
+ cpu_synth->tsc[0].val = local_count;
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ init_timer_deferrable(&per_cpu(tsc_timer, cpu));
+ per_cpu(tsc_timer, cpu).function = tsc_timer_fct;
+ per_cpu(tsc_timer, cpu).expires = jiffies + precalc_expire;
+}
+
+static void enable_synthetic_tsc(int cpu)
+{
+ smp_call_function_single(cpu, synthetic_tsc_ipi, NULL, 1);
+ add_timer_on(&per_cpu(tsc_timer, cpu), cpu);
+}
+
+static void disable_synthetic_tsc(int cpu)
+{
+ del_timer_sync(&per_cpu(tsc_timer, cpu));
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ prepare_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ enable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ disable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+void get_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
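+	/* Only the first user prepares and arms the per-CPU timers. */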
+ if (synthetic_tsc_refcount++)
+ goto end;
+
+ synthetic_tsc_enabled = 1;
+ for_each_online_cpu(cpu) {
+ prepare_synthetic_tsc(cpu);
+ enable_synthetic_tsc(cpu);
+ }
+end:
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(get_synthetic_tsc);
+
+void put_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
+ WARN_ON(synthetic_tsc_refcount <= 0);
+ if (synthetic_tsc_refcount != 1 || !synthetic_tsc_enabled)
+ goto end;
+
+ for_each_online_cpu(cpu)
+ disable_synthetic_tsc(cpu);
+ synthetic_tsc_enabled = 0;
+end:
+ synthetic_tsc_refcount--;
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(put_synthetic_tsc);
+
+/* Called from CPU 0, before any tracing starts, to init each structure */
+static int __init init_synthetic_tsc(void)
+{
+ precalc_stsc_interval();
+ hotcpu_notifier(hotcpu_callback, 3);
+ return 0;
+}
+
+/* Ideally run before SMP is up; late_initcall is used as a workaround for OMAP4 */
+late_initcall(init_synthetic_tsc);
diff --git a/kernel/trace/trace-clock.c b/kernel/trace/trace-clock.c
new file mode 100644
index 00000000000..3ed1667aacb
--- /dev/null
+++ b/kernel/trace/trace-clock.c
@@ -0,0 +1,97 @@
+/*
+ * kernel/trace/trace-clock.c
+ *
+ * (C) Copyright 2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Generic kernel tracing clock for architectures without TSC.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/jiffies.h>
+
+static int trace_clock_refcount;
+static DEFINE_MUTEX(trace_clock_mutex);
+static struct timer_list trace_clock_timer;
+/*
+ * bits 0..12 : counter, atomically incremented
+ * bits 13..{32,64} : time counter, incremented each jiffy.
+ */
+atomic_long_t trace_clock_var;
+EXPORT_SYMBOL(trace_clock_var);
+
+static void trace_clock_update(void)
+{
+ long old_clock, new_clock;
+ unsigned long ticks;
+
+ /*
+ * Make sure we keep track of delayed timer.
+	 * Make sure we account for a delayed timer (ticks can be > 1).
+ ticks = jiffies - trace_clock_timer.expires + 1;
+ /* Don't update if ticks is zero, time would go backward. */
+ if (unlikely(!ticks))
+ return;
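+	/*
+	 * Advance the time bits by "ticks" jiffies and clear the low-order
+	 * event counter bits; retry if a concurrent counter increment raced
+	 * with this update.
+	 */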
+ do {
+ old_clock = atomic_long_read(&trace_clock_var);
+ new_clock = (old_clock + (ticks << TRACE_CLOCK_SHIFT))
+ & (~((1 << TRACE_CLOCK_SHIFT) - 1));
+ } while (atomic_long_cmpxchg(&trace_clock_var, old_clock, new_clock)
+ != old_clock);
+}
+
+static void trace_clock_timer_fct(unsigned long data)
+{
+ trace_clock_update();
+ trace_clock_timer.expires = jiffies + 1;
+ add_timer(&trace_clock_timer);
+}
+
+static void enable_trace_clock(void)
+{
+ init_timer(&trace_clock_timer);
+ /* trace_clock_update() reads expires */
+ trace_clock_timer.function = trace_clock_timer_fct;
+ trace_clock_timer.expires = jiffies + 1;
+ trace_clock_update();
+ add_timer(&trace_clock_timer);
+}
+
+static void disable_trace_clock(void)
+{
+ del_timer_sync(&trace_clock_timer);
+}
+
+void get_trace_clock(void)
+{
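+	/*
+	 * Take a reference on the synthetic TSC first; the jiffies-based
+	 * trace clock timer is only started for the first user.
+	 */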
+ get_synthetic_tsc();
+ mutex_lock(&trace_clock_mutex);
+ if (trace_clock_refcount++)
+ goto end;
+ enable_trace_clock();
+end:
+ mutex_unlock(&trace_clock_mutex);
+}
+EXPORT_SYMBOL_GPL(get_trace_clock);
+
+void put_trace_clock(void)
+{
+ mutex_lock(&trace_clock_mutex);
+ WARN_ON(trace_clock_refcount <= 0);
+ if (trace_clock_refcount != 1)
+ goto end;
+ disable_trace_clock();
+end:
+ trace_clock_refcount--;
+ mutex_unlock(&trace_clock_mutex);
+ put_synthetic_tsc();
+}
+EXPORT_SYMBOL_GPL(put_trace_clock);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf..687699d365a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,6 +11,7 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>