From 13b9b6e746d753d43270a78dd39694912646b5d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Nov 2010 22:19:24 -0500 Subject: tracing: Fix module use of trace_bprintk() On use of trace_printk() there's a macro that determines if the format is static or a variable. If it is static, it defaults to __trace_bprintk() otherwise it uses __trace_printk(). A while ago, Lai Jiangshan added __trace_bprintk(). In that patch, we discussed a way to allow modules to use it. The difference between __trace_bprintk() and __trace_printk() is that for faster processing, just the format and args are stored in the trace instead of running it through a sprintf function. In order to do this, the format used by the __trace_bprintk() had to be persistent. See commit 1ba28e02a18cbdbea123836f6c98efb09cbf59ec The problem comes with trace_bprintk() where the module is unloaded. The pointer left in the buffer is still pointing to the format. To solve this issue, the formats in the module were copied into kernel core. If the same format was used, they would use the same copy (to prevent memory leak). This all worked well until we tried to merge everything. At the time this was written, Lai Jiangshan, Frederic Weisbecker, Ingo Molnar and myself were all touching the same code. When this was merged, we lost the part of it that was in module.c. This kept out the copying of the formats and unloading the module could cause bad pointers left in the ring buffer. This patch adds back (with updates required for current kernel) the module code that sets up the necessary pointers. Cc: Lai Jiangshan Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 437a74a7524..d190664f25f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * mod->num_trace_events, GFP_KERNEL); #endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_bprintk_fmt_start, + sizeof(*mod->trace_bprintk_fmt_start) * + mod->num_trace_bprintk_fmt, GFP_KERNEL); +#endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, "__mcount_loc", -- cgit v1.2.3 From 3c502e7a0255d82621ff25d60cc816624830497e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 4 Nov 2010 17:33:01 -0500 Subject: perf,hw_breakpoint: Initialize hardware api earlier When using early debugging, the kernel does not initialize the hw_breakpoint API early enough and causes the late initialization of the kernel debugger to fail. The boot arguments are: earlyprintk=vga ekgdboc=kbd kgdbwait Then simply type "go" at the kdb prompt and boot. The kernel will later emit the message: kgdb: Could not allocate hwbreakpoints And at that point the kernel debugger will cease to work correctly. The solution is to initialize the hw_breakpoint at the same time that all the other perf call backs are initialized instead of using a core_initcall() initialization which happens well after the kernel debugger can make use of hardware breakpoints. Signed-off-by: Jason Wessel CC: Frederic Weisbecker CC: Ingo Molnar CC: Peter Zijlstra LKML-Reference: <4CD3396D.1090308@windriver.com> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 3 +-- kernel/perf_event.c | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2c9120f0afc..e5325825aeb 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { .read = hw_breakpoint_pmu_read, }; -static int __init init_hw_breakpoint(void) +int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; int cpu, err_cpu; @@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) return -ENOMEM; } -core_initcall(init_hw_breakpoint); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..05b7d8c72c6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -6295,6 +6296,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) void __init perf_event_init(void) { + int ret; + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent); @@ -6302,4 +6305,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } -- cgit v1.2.3 From 91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Nov 2010 12:56:12 +0100 Subject: tracing: Fix recursive user stack trace The user stack trace can fault when examining the trace. Which would call the do_page_fault handler, which would trace again, which would do the user stack trace, which would fault and call do_page_fault again ... Thus this is causing a recursive bug. We need to have a recursion detector here. [ Resubmitted by Jiri Olsa ] [ Eric Dumazet recommended using __this_cpu_* instead of __get_cpu_* ] Cc: Eric Dumazet Signed-off-by: Jiri Olsa LKML-Reference: <1289390172-9730-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82d9b8106cd..ee6a7339cf0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1284,6 +1284,8 @@ void trace_dump_stack(void) __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); } +static DEFINE_PER_CPU(int, user_stack_count); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (unlikely(in_nmi())) return; + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. + */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + __this_cpu_dec(user_stack_count); + + out: + preempt_enable(); } #ifdef UNUSED -- cgit v1.2.3 From 8882135bcd332f294df5455747ea43ba9e6f77ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Nov 2010 19:01:43 +0100 Subject: perf: Fix owner-list vs exit Oleg noticed that a perf-fd keeping a reference on the creating task leads to a few funny side effects. There's two different aspects to this: - kernel based perf-events, these should not take out a reference on the creating task and appear on the task's event list since they're not bound to fds nor visible to userspace. - fork() and pthread_create(), these can lead to the creating task dying (and thus the task's event-list becomming useless) but keeping the list and ref alive until the event is closed. Combined they lead to malfunction of the ptrace hw_tracepoints. Cure this by not considering kernel based perf_events for the owner-list and destroying the owner-list when the owner dies. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov LKML-Reference: <1289576883.2084.286.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f818d9d2dc9..671f6c8c8a3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); - mutex_lock(&event->owner->perf_event_mutex); - list_del_init(&event->owner_entry); - mutex_unlock(&event->owner->perf_event_mutex); - put_task_struct(event->owner); - free_event(event); return 0; @@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { struct perf_event *event = file->private_data; + struct task_struct *owner; file->private_data = NULL; + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + return perf_event_release_kernel(event); } @@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); event->owner = current; - get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ++ctx->generation; mutex_unlock(&ctx->mutex); - event->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - return event; err_free: @@ -5902,8 +5925,24 @@ again: */ void perf_event_exit_task(struct task_struct *child) { + struct perf_event *event, *tmp; int ctxn; + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); } -- cgit v1.2.3 From 94e8ba728640dc01375a14e337f3b892bfacbeeb Mon Sep 17 00:00:00 2001 From: Sergio Aguirre Date: Tue, 16 Nov 2010 12:02:47 -0600 Subject: irq_work: Drop cmpxchg() result The compiler warned us about: kernel/irq_work.c: In function 'irq_work_run': kernel/irq_work.c:148: warning: value computed is not used Dropping the cmpxchg() result is indeed weird, but correct - so annotate away the warning. Signed-off-by: Sergio Aguirre Cc: Huang Ying Cc: Martin Schwidefsky Cc: Kyle McMartin Signed-off-by: Peter Zijlstra LKML-Reference: <1289930567-17828-1-git-send-email-saaguirre@ti.com> Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f16763ff848..90f881904bb 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -145,7 +145,9 @@ void irq_work_run(void) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + (void)cmpxchg(&entry->next, + next_flags(NULL, IRQ_WORK_BUSY), + NULL); } } EXPORT_SYMBOL_GPL(irq_work_run); -- cgit v1.2.3