/*
 * arch/x86/kernel/trace-clock.c
 *
 * Trace clock for x86.
 *
 * Mathieu Desnoyers, October 2008
 */

#include <linux/module.h>
#include <linux/trace-clock.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/cpu.h>
#include <linux/posix-timers.h>
#include <linux/spinlock.h>

static cycles_t trace_clock_last_tsc;
static DEFINE_PER_CPU(struct timer_list, update_timer);
static DEFINE_SPINLOCK(async_tsc_lock);
static int async_tsc_refcount;	/* Number of readers */
static int async_tsc_enabled;	/* Async TSC enabled on all online CPUs */

int _trace_clock_is_sync = 1;
EXPORT_SYMBOL_GPL(_trace_clock_is_sync);

/*
 * Is the trace clock being used by user-space? We leave the trace clock
 * active as soon as user-space starts using it, and we never release the
 * reference taken on its behalf.
 */
static atomic_t user_trace_clock_ref;

/*
 * Called by check_tsc_sync_source() from CPU hotplug.
 */
void set_trace_clock_is_sync(int state)
{
	_trace_clock_is_sync = state;
	update_trace_clock_is_sync_vdso();
}

#if BITS_PER_LONG == 64
static cycles_t read_last_tsc(void)
{
	return trace_clock_last_tsc;
}
#else
/*
 * A cmpxchg64 update can happen concurrently. Based on the assumption that
 * two cmpxchg64 updates never store the same value (the count always
 * increases), reading the variable twice ensures that we read a coherent
 * value with the same "sequence number".
 */
static cycles_t read_last_tsc(void)
{
	cycles_t val1, val2;

	val1 = trace_clock_last_tsc;
	for (;;) {
		val2 = val1;
		barrier();
		val1 = trace_clock_last_tsc;
		if (likely(val1 == val2))
			break;
	}
	return val1;
}
#endif

/*
 * Support for architectures with non-synchronized TSCs.
 * When the local TSC is discovered to lag behind the highest TSC counter, we
 * increment the TSC count by an amount that should ideally be lower than the
 * execution time of this routine, in cycles: this is the granularity we look
 * for, since we must be able to order the events.
 */
notrace cycles_t trace_clock_async_tsc_read(void)
{
	cycles_t new_tsc, last_tsc;

	WARN_ON(!async_tsc_refcount || !async_tsc_enabled);
	new_tsc = get_cycles();
	last_tsc = read_last_tsc();
	do {
		if (new_tsc < last_tsc)
			new_tsc = last_tsc + TRACE_CLOCK_MIN_PROBE_DURATION;
		/*
		 * If cmpxchg fails with a value higher than new_tsc, don't
		 * retry: the value has already been incremented and the
		 * events happened almost at the same time.
		 * We must retry if cmpxchg fails with a lower value: it means
		 * that we are the CPU with the highest frequency and
		 * therefore MUST update the value.
		 */
		last_tsc = cmpxchg64(&trace_clock_last_tsc, last_tsc, new_tsc);
	} while (unlikely(last_tsc < new_tsc));
	return new_tsc;
}
EXPORT_SYMBOL_GPL(trace_clock_async_tsc_read);

static void update_timer_ipi(void *info)
{
	(void)trace_clock_async_tsc_read();
}

/*
 * update_timer_fct - Timer function to resync the clocks
 * @data: unused
 *
 * Fires every jiffy.
 */
static void update_timer_fct(unsigned long data)
{
	(void)trace_clock_async_tsc_read();
	mod_timer_pinned(&per_cpu(update_timer, smp_processor_id()),
			 jiffies + 1);
}
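/*
 * Each CPU that participates in the asynchronous trace clock runs a pinned
 * per-CPU timer that touches the clock once per jiffy, so that
 * trace_clock_last_tsc keeps making forward progress even on otherwise idle
 * CPUs and the error of a lagging TSC stays bounded. enable_trace_clock()
 * also forces an immediate read on the target CPU via IPI before arming the
 * timer.
 */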
static void enable_trace_clock(int cpu)
{
	init_timer(&per_cpu(update_timer, cpu));
	per_cpu(update_timer, cpu).function = update_timer_fct;
	per_cpu(update_timer, cpu).expires = jiffies + 1;
	smp_call_function_single(cpu, update_timer_ipi, NULL, 1);
	add_timer_on(&per_cpu(update_timer, cpu), cpu);
}

static void disable_trace_clock(int cpu)
{
	del_timer_sync(&per_cpu(update_timer, cpu));
}

/*
 * hotcpu_callback - CPU hotplug callback
 * @nb: notifier block
 * @action: hotplug action to take
 * @hcpu: CPU number
 *
 * Returns the success/failure of the operation (NOTIFY_OK, NOTIFY_BAD).
 */
static int __cpuinit hotcpu_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	unsigned int hotcpu = (unsigned long)hcpu;
	int cpu;

	spin_lock(&async_tsc_lock);
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/*
		 * trace_clock_is_sync() is updated by set_trace_clock_is_sync(),
		 * protected by CPU hotplug disable.
		 * It is OK to let the hotplugged CPU read the timebase before
		 * the CPU_ONLINE notification; the timer is just there to give
		 * a maximum bound to the TSC error.
		 */
		if (async_tsc_refcount && !trace_clock_is_sync()) {
			if (!async_tsc_enabled) {
				async_tsc_enabled = 1;
				for_each_online_cpu(cpu)
					enable_trace_clock(cpu);
			} else {
				enable_trace_clock(hotcpu);
			}
		}
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		if (!async_tsc_refcount && num_online_cpus() == 1)
			set_trace_clock_is_sync(1);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/*
		 * We cannot stop the trace clock on other CPUs while readers
		 * are active, even if we go back to a synchronized state
		 * (one CPU), because the remaining CPU could be the one
		 * lagging behind.
		 */
		if (async_tsc_refcount && async_tsc_enabled)
			disable_trace_clock(hotcpu);
		if (!async_tsc_refcount && num_online_cpus() == 1)
			set_trace_clock_is_sync(1);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	spin_unlock(&async_tsc_lock);
	return NOTIFY_OK;
}

int get_trace_clock(void)
{
	int cpu;

	if (!trace_clock_is_sync()) {
		printk(KERN_WARNING
		       "Trace clock falls back on cache-line bouncing\n"
		       "workaround due to non-synchronized TSCs.\n"
		       "This workaround preserves event order across CPUs.\n"
		       "Please consider disabling Speedstep or PowerNow and\n"
		       "using kernel parameters "
		       "\"force_tsc_sync=1 idle=poll\"\n"
		       "for an accurate and fast tracing clock source.\n");
	}

	get_online_cpus();
	spin_lock(&async_tsc_lock);
	if (async_tsc_refcount++ || trace_clock_is_sync())
		goto end;

	async_tsc_enabled = 1;
	for_each_online_cpu(cpu)
		enable_trace_clock(cpu);
end:
	spin_unlock(&async_tsc_lock);
	put_online_cpus();
	return 0;
}
EXPORT_SYMBOL_GPL(get_trace_clock);

void put_trace_clock(void)
{
	int cpu;

	get_online_cpus();
	spin_lock(&async_tsc_lock);
	WARN_ON(async_tsc_refcount <= 0);
	if (async_tsc_refcount != 1 || !async_tsc_enabled)
		goto end;

	for_each_online_cpu(cpu)
		disable_trace_clock(cpu);
	async_tsc_enabled = 0;
end:
	async_tsc_refcount--;
	if (!async_tsc_refcount && num_online_cpus() == 1)
		set_trace_clock_is_sync(1);
	spin_unlock(&async_tsc_lock);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(put_trace_clock);
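/*
 * Illustrative in-kernel usage (a sketch, not code from this file): a tracer
 * takes a reference for as long as it emits events and reads timestamps with
 * trace_clock_read64(), the generic accessor also used by the POSIX clock
 * below:
 *
 *	if (!get_trace_clock()) {
 *		u64 ts = trace_clock_read64();
 *		...
 *		put_trace_clock();
 *	}
 */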
static int posix_get_trace(clockid_t which_clock, struct timespec *tp)
{
	union lttng_timespec *lts = (union lttng_timespec *) tp;
	int ret;

	/*
	 * Yes, there is a race here that could lead to the refcount being
	 * incremented more than once, but all we care about is leaving the
	 * trace clock active forever, so precise accounting is not needed.
	 */
	if (unlikely(!atomic_read(&user_trace_clock_ref))) {
		ret = get_trace_clock();
		if (ret)
			return ret;
		atomic_inc(&user_trace_clock_ref);
	}
	lts->lttng_ts = trace_clock_read64();
	return 0;
}

static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp)
{
	union lttng_timespec *lts = (union lttng_timespec *) tp;

	lts->lttng_ts = trace_clock_frequency();
	return 0;
}

static int posix_get_trace_res(const clockid_t which_clock,
			       struct timespec *tp)
{
	union lttng_timespec *lts = (union lttng_timespec *) tp;

	lts->lttng_ts = TRACE_CLOCK_RES;
	return 0;
}

static __init int init_unsync_trace_clock(void)
{
	struct k_clock clock_trace = {
		.clock_getres = posix_get_trace_res,
		.clock_get = posix_get_trace,
	};
	struct k_clock clock_trace_freq = {
		.clock_getres = posix_get_trace_res,
		.clock_get = posix_get_trace_freq,
	};

	register_posix_clock(CLOCK_TRACE, &clock_trace);
	register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq);

	hotcpu_notifier(hotcpu_callback, 4);
	return 0;
}
early_initcall(init_unsync_trace_clock);
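/*
 * User-space usage sketch (illustrative; CLOCK_TRACE, CLOCK_TRACE_FREQ and
 * the lttng_timespec layout are assumed to be exposed by the matching
 * user-space headers). The clock is read through a plain clock_gettime()
 * call and the returned timespec is reinterpreted as a 64-bit cycle count,
 * mirroring the union lttng_timespec cast performed above:
 *
 *	struct timespec ts;
 *	uint64_t cycles;
 *
 *	if (!clock_gettime(CLOCK_TRACE, &ts)) {
 *		memcpy(&cycles, &ts, sizeof(cycles));
 *		...
 *	}
 *
 * Reading CLOCK_TRACE_FREQ the same way yields the counter frequency, which
 * lets user-space convert cycle deltas into seconds.
 */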