From dddd3379a619a4cb8247bfd3c94ca9ae3797aa2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 Nov 2010 10:05:55 +0100 Subject: perf: Fix inherit vs. context rotation bug It was found that sometimes children of tasks with inherited events had one extra event. Eventually it turned out to be due to the list rotation no being exclusive with the list iteration in the inheritance code. Cure this by temporarily disabling the rotation while we inherit the events. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/perf_event.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 40150f34598..142e3d6042c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -850,6 +850,7 @@ struct perf_event_context { int nr_active; int is_active; int nr_stat; + int rotate_disable; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 671f6c8c8a3..f365dd8ef8b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1622,8 +1622,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -6162,6 +6166,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp[ctxn] = NULL; @@ -6202,6 +6207,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); @@ -6209,6 +6223,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { -- cgit v1.2.3 From 33c6d6a7ad0ffab9b1b15f8e4107a2af072a05a0 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Nov 2010 16:55:23 -0500 Subject: x86, perf, nmi: Disable perf if counters are not accessible In a kvm virt guests, the perf counters are not emulated. Instead they return zero on a rdmsrl. The perf nmi handler uses the fact that crossing a zero means the counter overflowed (for those counters that do not have specific interrupt bits). Therefore on kvm guests, perf will swallow all NMIs thinking the counters overflowed. This causes problems for subsystems like kgdb which needs NMIs to do its magic. This problem was discovered by running kgdb tests. The solution is to write garbage into a perf counter during the initialization and hopefully reading back the same number. On kvm guests, the value will be read back as zero and we disable perf as a result. Reported-by: Jason Wessel Patch-inspired-by: Peter Zijlstra Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <1290462923-30734-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ed6310183ef..6d75b9145b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -381,6 +381,20 @@ static void release_pmc_hardware(void) {} #endif +static bool check_hw_exists(void) +{ + u64 val, val_new = 0; + int ret = 0; + + val = 0xabcdUL; + ret |= checking_wrmsrl(x86_pmu.perfctr, val); + ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); + if (ret || val != val_new) + return false; + + return true; +} + static void reserve_ds_buffers(void); static void release_ds_buffers(void); @@ -1372,6 +1386,12 @@ void __init init_hw_perf_events(void) pmu_check_apic(); + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) { + pr_cont("Broken PMU hardware detected, software events only.\n"); + return; + } + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.quirks) -- cgit v1.2.3 From cc2067a51424dd25c10c1b1230b4222d8baec94d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Nov 2010 21:49:01 +0100 Subject: perf, x86: Fixup Kconfig deps This leads to a Kconfig dep inversion, x86 selects PERF_EVENT (due to a hw_breakpoint dep) but doesn't unconditionally provide HAVE_PERF_EVENT. (This can cause build failures on M386/M486 kernel .config's.) Signed-off-by: Peter Zijlstra LKML-Reference: <20101117222055.982965150@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e8327686d3c..e330da21b84 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,7 +21,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_PERF_EVENTS select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES -- cgit v1.2.3 From ee6dcfa40a50fe12a3ae0fb4d2653c66c3ed6556 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Nov 2010 13:49:04 +0100 Subject: perf: Fix the software context switch counter Stephane noticed that because the perf_sw_event() call is inside the perf_event_task_sched_out() call it won't get called unless we have a per-task counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 29 +++++++++++++++-------------- kernel/perf_event.c | 2 -- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 142e3d6042c..de2c41758e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -909,20 +909,6 @@ extern int perf_num_counters(void); extern const char *perf_pmu_name(void); extern void __perf_event_task_sched_in(struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); - -extern atomic_t perf_task_events; - -static inline void perf_event_task_sched_in(struct task_struct *task) -{ - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); -} - -static inline -void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) -{ - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); -} - extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); @@ -1031,6 +1017,21 @@ have_event: __perf_sw_event(event_id, nr, nmi, regs, addr); } +extern atomic_t perf_task_events; + +static inline void perf_event_task_sched_in(struct task_struct *task) +{ + COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); +} + +static inline +void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +{ + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f365dd8ef8b..eac7e336433 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1287,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); } -- cgit v1.2.3