From 82f67cd9fca8c8762c15ba7ed0d5747588c1e221 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Feb 2007 01:28:13 -0800 Subject: [PATCH] Add debugging feature /proc/timer_stat Add /proc/timer_stats support: debugging feature to profile timer expiration. Both the starting site, process/PID and the expiration function is captured. This allows the quick identification of timer event sources in a system. Sample output: # echo 1 > /proc/timer_stats # cat /proc/timer_stats Timer Stats Version: v0.1 Sample period: 4.010 s 24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) 11, 0 swapper sk_reset_timer (tcp_delack_timer) 6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) 2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) 17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick) 2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) 4, 2050 pcscd do_nanosleep (hrtimer_wakeup) 5, 4179 sshd sk_reset_timer (tcp_write_timer) 4, 2248 yum-updatesd schedule_timeout (process_timeout) 18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick) 3, 0 swapper sk_reset_timer (tcp_delack_timer) 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) 2, 1 swapper e1000_up (e1000_watchdog) 1, 1 init schedule_timeout (process_timeout) 100 total events, 25.24 events/sec [ cleanups and hrtimers support from Thomas Gleixner ] [bunk@stusta.de: nr_entries can become static] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Cc: Roman Zippel Cc: Andi Kleen Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 26 +++ kernel/time/Makefile | 1 + kernel/time/timer_stats.c | 411 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 31 +++- kernel/workqueue.c | 7 +- 5 files changed, 472 insertions(+), 4 deletions(-) create mode 100644 kernel/time/timer_stats.c (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 62aad8e1a383..476cb0c0b4a4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -585,6 +585,18 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +#ifdef CONFIG_TIMER_STATS +void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /* * Counterpart to lock_timer_base above: */ @@ -743,6 +755,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ + timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram); @@ -791,6 +804,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } timer->expires = tim; + timer_stats_hrtimer_set_start_info(timer); + enqueue_hrtimer(timer, new_base, base == new_base); unlock_hrtimer_base(timer, &flags); @@ -925,6 +940,12 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &cpu_base->clock_base[clock_id]; hrtimer_init_timer_hres(timer); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1006,6 +1027,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); /* * Note: We clear the CALLBACK bit after @@ -1050,6 +1072,8 @@ static void run_hrtimer_softirq(struct softirq_action *h) timer = list_entry(cpu_base->cb_pending.next, struct hrtimer, cb_entry); + timer_stats_account_hrtimer(timer); + fn = timer->function; __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); spin_unlock_irq(&cpu_base->lock); @@ -1106,6 +1130,8 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, if (base->softirq_time.tv64 <= timer->expires.tv64) break; + timer_stats_account_hrtimer(timer); + fn = timer->function; __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); spin_unlock_irq(&cpu_base->lock); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f246bc836b9a..252b4b5d9dd0 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 000000000000..1bc4882e28e0 --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,411 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo 1[0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(spinlock_t, lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +static int __read_mostly active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + memcpy(curr->comm, comm, TASK_COMM_LEN); + if (prev) + prev->next = curr; + else + *head = curr; + curr->next = NULL; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm) +{ + /* + * It doesnt matter which lock we take: + */ + spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + struct entry *entry, input; + unsigned long flags; + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + + spin_lock_irqsave(lock, flags); + if (!active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + seq_printf(m, "%4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, + events / period.tv_sec, events * 1000 / ms); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + /* nothing */ + spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (active) { + active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!active) { + reset_entries(); + time_start = ktime_get(); + active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &tstats_fops; + + return 0; +} +__initcall(init_tstats_procfs); diff --git a/kernel/timer.c b/kernel/timer.c index f058e6cfd50c..c3c67f3c0190 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -263,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -274,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); static inline void detach_timer(struct timer_list *timer, - int clear_pending) + int clear_pending) { struct list_head *entry = &timer->entry; @@ -325,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) unsigned long flags; int ret = 0; + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -375,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -407,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); + timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -437,6 +458,7 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -570,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) fn = timer->function; data = timer->data; + timer_stats_account_timer(timer); + set_running_timer(base, timer); detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -1229,7 +1253,8 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1675,6 +1700,8 @@ void __init init_timers(void) int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + init_timer_stats(); + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 020d1fff57dc..b6fa5e63085d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct workqueue_struct *wq = get_wq_data(&dwork->work); @@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(timer); if (delay == 0) return queue_work(wq, work); @@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); -- cgit v1.2.3