From bc078e4eab65f11bbaeed380593ab8151b30d703 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 2 Mar 2010 16:01:10 +0100
Subject: oprofile: convert oprofile from timer_hook to hrtimer

Oprofile is currently broken on systems running with NOHZ enabled.
A maximum of 1 tick is accounted via the timer_hook if a cpu sleeps
for a longer period of time. This does bad things to the percentages
in the profiler output. To solve this problem convert oprofile to
use a restarting hrtimer instead of the timer_hook.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/oprof.c     | 12 ++++---
 drivers/oprofile/oprof.h     |  3 +-
 drivers/oprofile/timer_int.c | 78 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index dc8a0428260..b336cd9ee7a 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -253,22 +253,26 @@ static int __init oprofile_init(void)
 	int err;
 
 	err = oprofile_arch_init(&oprofile_ops);
-
 	if (err < 0 || timer) {
 		printk(KERN_INFO "oprofile: using timer interrupt.\n");
-		oprofile_timer_init(&oprofile_ops);
+		err = oprofile_timer_init(&oprofile_ops);
+		if (err)
+			goto out_arch;
 	}
-
 	err = oprofilefs_register();
 	if (err)
-		oprofile_arch_exit();
+		goto out_arch;
+	return 0;
 
+out_arch:
+	oprofile_arch_exit();
 	return err;
 }
 
 
 static void __exit oprofile_exit(void)
 {
+	oprofile_timer_exit();
 	oprofilefs_unregister();
 	oprofile_arch_exit();
 }
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index cb92f5c98c1..47e12cb4ee8 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -34,7 +34,8 @@ struct super_block;
 struct dentry;
 
 void oprofile_create_files(struct super_block *sb, struct dentry *root);
-void oprofile_timer_init(struct oprofile_operations *ops);
+int oprofile_timer_init(struct oprofile_operations *ops);
+void oprofile_timer_exit(void);
 
 int oprofile_set_backtrace(unsigned long depth);
 int oprofile_set_timeout(unsigned long time);
diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c
index 333f915568c..dc0ae4d14df 100644
--- a/drivers/oprofile/timer_int.c
+++ b/drivers/oprofile/timer_int.c
@@ -13,34 +13,94 @@
 #include <linux/oprofile.h>
 #include <linux/profile.h>
 #include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/hrtimer.h>
+#include <asm/irq_regs.h>
 #include <asm/ptrace.h>
 
 #include "oprof.h"
 
-static int timer_notify(struct pt_regs *regs)
+static DEFINE_PER_CPU(struct hrtimer, oprofile_hrtimer);
+
+static enum hrtimer_restart oprofile_hrtimer_notify(struct hrtimer *hrtimer)
+{
+	oprofile_add_sample(get_irq_regs(), 0);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(TICK_NSEC));
+	return HRTIMER_RESTART;
+}
+
+static void __oprofile_hrtimer_start(void *unused)
+{
+	struct hrtimer *hrtimer = &__get_cpu_var(oprofile_hrtimer);
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = oprofile_hrtimer_notify;
+
+	hrtimer_start(hrtimer, ns_to_ktime(TICK_NSEC),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static int oprofile_hrtimer_start(void)
 {
-	oprofile_add_sample(regs, 0);
+	on_each_cpu(__oprofile_hrtimer_start, NULL, 1);
 	return 0;
 }
 
-static int timer_start(void)
+static void __oprofile_hrtimer_stop(int cpu)
 {
-	return register_timer_hook(timer_notify);
+	struct hrtimer *hrtimer = &per_cpu(oprofile_hrtimer, cpu);
+
+	hrtimer_cancel(hrtimer);
 }
 
+static void oprofile_hrtimer_stop(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		__oprofile_hrtimer_stop(cpu);
+}
 
-static void timer_stop(void)
+static int __cpuinit oprofile_cpu_notify(struct notifier_block *self,
+					 unsigned long action, void *hcpu)
 {
-	unregister_timer_hook(timer_notify);
+	long cpu = (long) hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		smp_call_function_single(cpu, __oprofile_hrtimer_start,
+					 NULL, 1);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		__oprofile_hrtimer_stop(cpu);
+		break;
+	}
+	return NOTIFY_OK;
 }
 
+static struct notifier_block __refdata oprofile_cpu_notifier = {
+	.notifier_call = oprofile_cpu_notify,
+};
 
-void __init oprofile_timer_init(struct oprofile_operations *ops)
+int __init oprofile_timer_init(struct oprofile_operations *ops)
 {
+	int rc;
+
+	rc = register_hotcpu_notifier(&oprofile_cpu_notifier);
+	if (rc)
+		return rc;
 	ops->create_files = NULL;
 	ops->setup = NULL;
 	ops->shutdown = NULL;
-	ops->start = timer_start;
-	ops->stop = timer_stop;
+	ops->start = oprofile_hrtimer_start;
+	ops->stop = oprofile_hrtimer_stop;
 	ops->cpu_type = "timer";
+	return 0;
+}
+
+void __exit oprofile_timer_exit(void)
+{
+	unregister_hotcpu_notifier(&oprofile_cpu_notifier);
 }
-- 
cgit v1.2.3


From 4bdde044dc36ac7b01f7502394d52619af9d1927 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:05 +0800
Subject: tracing: Convert some signal events to DEFINE_TRACE

Use DECLARE_EVENT_CLASS to remove duplicate code:

text    data     bss     dec     hex filename
  23639    6084       8   29731    7423 kernel/signal.o.orig
  22727    6084       8   28819    7093 kernel/signal.o

2 events are converted:

  signal_queue_overflow: signal_overflow_fail, signal_lose_info

No functional change.

Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FBD.8070703@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/signal.h | 52 ++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 30 deletions(-)

diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a510b75ac30..814566c99d2 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
 		  __entry->sa_handler, __entry->sa_flags)
 );
 
-/**
- * signal_overflow_fail - called when signal queue is overflow
- * @sig: signal number
- * @group: signal to process group or not (bool)
- * @info: pointer to struct siginfo
- *
- * Kernel fails to generate 'sig' signal with 'info' siginfo, because
- * siginfo queue is overflow, and the signal is dropped.
- * 'group' is not 0 if the signal will be sent to a process group.
- * 'sig' is always one of RT signals.
- */
-TRACE_EVENT(signal_overflow_fail,
+DECLARE_EVENT_CLASS(signal_queue_overflow,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
@@ -134,6 +123,24 @@ TRACE_EVENT(signal_overflow_fail,
 		  __entry->sig, __entry->group, __entry->errno, __entry->code)
 );
 
+/**
+ * signal_overflow_fail - called when signal queue is overflow
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * Kernel fails to generate 'sig' signal with 'info' siginfo, because
+ * siginfo queue is overflow, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of RT signals.
+ */
+DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
+
+	TP_PROTO(int sig, int group, struct siginfo *info),
+
+	TP_ARGS(sig, group, info)
+);
+
 /**
  * signal_lose_info - called when siginfo is lost
  * @sig: signal number
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
  * 'group' is not 0 if the signal will be sent to a process group.
  * 'sig' is always one of non-RT signals.
  */
-TRACE_EVENT(signal_lose_info,
+DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
-	TP_ARGS(sig, group, info),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig	)
-		__field(	int,	group	)
-		__field(	int,	errno	)
-		__field(	int,	code	)
-	),
-
-	TP_fast_assign(
-		__entry->sig	= sig;
-		__entry->group	= group;
-		TP_STORE_SIGINFO(__entry, info);
-	),
-
-	TP_printk("sig=%d group=%d errno=%d code=%d",
-		  __entry->sig, __entry->group, __entry->errno, __entry->code)
+	TP_ARGS(sig, group, info)
 );
+
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 50354a8a28d0c91695a2d6d25b5a821bfe557a07 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:24 +0800
Subject: tracing: Update comments

Make some comments consistent with the code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FD0.7090202@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ea6f9d4a20e..75dd7787fb3 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -154,9 +154,11 @@
  *
  *	field = (typeof(field))entry;
  *
- *	p = get_cpu_var(ftrace_event_seq);
+ *	p = &get_cpu_var(ftrace_event_seq);
  *	trace_seq_init(p);
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	ret = trace_seq_printf(s, "%s: ", <call>);
+ *	if (ret)
+ *		ret = trace_seq_printf(s, <TP_printk> "\n");
  *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
@@ -450,38 +452,38 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *
  * static void ftrace_raw_event_<call>(proto)
  * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	struct ring_buffer_event *event;
  *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
  *	struct ring_buffer *buffer;
  *	unsigned long irq_flags;
+ *	int __data_size;
  *	int pc;
  *
  *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *
  *	event = trace_current_buffer_lock_reserve(&buffer,
  *				  event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
+ *				  sizeof(*entry) + __data_size,
  *				  irq_flags, pc);
  *	if (!event)
  *		return;
  *	entry	= ring_buffer_event_data(event);
  *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
+ *	{ <assign>; }  <-- Here we assign the entries by the __field and
+ *			   __array macros.
  *
- *	trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ *	if (!filter_current_check_discard(buffer, event_call, entry, event))
+ *		trace_current_buffer_unlock_commit(buffer,
+ *						   event, irq_flags, pc);
  * }
  *
  * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
+ *	return register_trace_<call>(ftrace_raw_event_<call>);
  * }
  *
  * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
@@ -493,6 +495,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
+ * static const char print_fmt_<call>[] = <TP_printk>;
+ *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
@@ -501,6 +505,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.raw_init		= trace_event_raw_init,
  *	.regfunc		= ftrace_reg_event_<call>,
  *	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.print_fmt		= print_fmt_<call>,
+ *	.define_fields		= ftrace_define_fields_<call>,
  * }
  *
  */
@@ -569,7 +575,6 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,	\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-									\
 	tstruct								\
 									\
 	{ assign; }							\
-- 
cgit v1.2.3


From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:57:43 +0800
Subject: tracing: Remove side effect from module tracepoints that caused a GPF

Remove the @refcnt argument, because it has side-effects, and arguments with
side-effects are not skipped by the jump over disabled instrumentation and are
executed even when the tracepoint is disabled.

This was also causing a GPF as found by Randy Dunlap:

Subject: 2.6.33 GP fault only when built with tracing
LKML-Reference: <4BA2B69D.3000309@oracle.com>

Note, the current 2.6.34-rc has a fix for the actual cause of the GPF,
but this fixes one of its triggers.

Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/module.h        |  6 ++----
 include/trace/events/module.h | 14 +++++++-------
 kernel/module.c               |  3 +--
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 5e869ffd34a..393ec39b580 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -460,8 +460,7 @@ static inline void __module_get(struct module *module)
 	if (module) {
 		preempt_disable();
 		__this_cpu_inc(module->refptr->count);
-		trace_module_get(module, _THIS_IP_,
-				 __this_cpu_read(module->refptr->count));
+		trace_module_get(module, _THIS_IP_);
 		preempt_enable();
 	}
 }
@@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module)
 
 		if (likely(module_is_live(module))) {
 			__this_cpu_inc(module->refptr->count);
-			trace_module_get(module, _THIS_IP_,
-				__this_cpu_read(module->refptr->count));
+			trace_module_get(module, _THIS_IP_);
 		}
 		else
 			ret = 0;
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 4b0f48ba16a..a585f8135bd 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -53,9 +53,9 @@ TRACE_EVENT(module_free,
 
 DECLARE_EVENT_CLASS(module_refcnt,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt),
+	TP_ARGS(mod, ip),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	ip		)
@@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_fast_assign(
 		__entry->ip	= ip;
-		__entry->refcnt	= refcnt;
+		__entry->refcnt	= __this_cpu_read(mod->refptr->count);
 		__assign_str(name, mod->name);
 	),
 
@@ -75,16 +75,16 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 DEFINE_EVENT(module_refcnt, module_get,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 DEFINE_EVENT(module_refcnt, module_put,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 TRACE_EVENT(module_request,
diff --git a/kernel/module.c b/kernel/module.c
index c968d3606dc..21591ad921f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -800,8 +800,7 @@ void module_put(struct module *module)
 		preempt_disable();
 		__this_cpu_dec(module->refptr->count);
 
-		trace_module_put(module, _RET_IP_,
-				 __this_cpu_read(module->refptr->count));
+		trace_module_put(module, _RET_IP_);
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
-- 
cgit v1.2.3


From eb0c53771fb2f5f66b0edb3ebce33be4bbf1c285 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 29 Mar 2010 14:25:18 -0400
Subject: tracing: Fix compile error in module tracepoints when MODULE_UNLOAD
 not set

If modules are configured in the build but unloading of modules is not,
then the refcnt is not defined. Place the get/put module tracepoints
under CONFIG_MODULE_UNLOAD since it references this field in the module
structure.

As a side-effect, this patch also reduces the code when MODULE_UNLOAD
is not set, because these unused tracepoints are not created.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/module.h | 4 ++++
 kernel/module.c               | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index a585f8135bd..f07b44a2b24 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -51,6 +51,9 @@ TRACE_EVENT(module_free,
 	TP_printk("%s", __get_str(name))
 );
 
+#ifdef CONFIG_MODULE_UNLOAD
+/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
+
 DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_PROTO(struct module *mod, unsigned long ip),
@@ -86,6 +89,7 @@ DEFINE_EVENT(module_refcnt, module_put,
 
 	TP_ARGS(mod, ip)
 );
+#endif /* CONFIG_MODULE_UNLOAD */
 
 TRACE_EVENT(module_request,
 
diff --git a/kernel/module.c b/kernel/module.c
index 21591ad921f..d9e237926b6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
 #if 0
 #define DEBUGP printk
 #else
@@ -467,6 +465,9 @@ MODINFO_ATTR(srcversion);
 static char last_unloaded_module[MODULE_NAME_LEN+1];
 
 #ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-- 
cgit v1.2.3


From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 13:21:56 -0400
Subject: ring-buffer: Add place holder recording of dropped events

Currently, when the ring buffer drops events, it does not record
the fact that it did so. It does inform the writer that the event
was dropped by returning a NULL event, but it does not put in any
place holder where the event was dropped.

This is not a trivial thing to add because the ring buffer mostly
runs in overwrite (flight recorder) mode. That is, when the ring
buffer is full, new data will overwrite old data.

In a produce/consumer mode, where new data is simply dropped when
the ring buffer is full, it is trivial to add the placeholder
for dropped events. When there's more room to write new data, then
a special event can be added to notify the reader about the dropped
events.

But in overwrite mode, any new write can overwrite events. A place
holder can not be inserted into the ring buffer since there never
may be room. A reader could also come in at anytime and miss the
placeholder.

Luckily, the way the ring buffer works, the read side can find out
if events were lost or not, and how many events. Everytime a write
takes place, if it overwrites the header page (the next read) it
updates a "overrun" variable that keeps track of the number of
lost events. When a reader swaps out a page from the ring buffer,
it can record this number, perfom the swap, and then check to
see if the number changed, and take the diff if it has, which would be
the number of events dropped. This can be stored by the reader
and returned to callers of the reader.

Since the reader page swap will fail if the writer moved the head
page since the time the reader page set up the swap, this gives room
to record the overruns without worrying about races. If the reader
sets up the pages, records the overrun, than performs the swap,
if the swap succeeds, then the overrun variable has not been
updated since the setup before the swap.

For binary readers of the ring buffer, a flag is set in the header
of each sub page (sub buffer) of the ring buffer. This flag is embedded
in the size field of the data on the sub buffer, in the 31st bit (the size
can be 32 or 64 bits depending on the architecture), but only 27
bits needs to be used for the actual size (less actually).

We could add a new field in the sub buffer header to also record the
number of events dropped since the last read, but this will change the
format of the binary ring buffer a bit too much. Perhaps this change can
be made if the information on the number of events dropped is considered
important enough.

Note, the notification of dropped events is only used by consuming reads
or peeking at the ring buffer. Iterating over the ring buffer does not
keep this information because the necessary data is only available when
a page swap is made, and the iterator does not swap out pages.

Cc: Robert Richter <robert.richter@amd.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 drivers/oprofile/cpu_buffer.c        |  4 +-
 include/linux/ring_buffer.h          |  6 ++-
 kernel/trace/ring_buffer.c           | 72 +++++++++++++++++++++++++++++++++---
 kernel/trace/ring_buffer_benchmark.c |  2 +-
 kernel/trace/trace.c                 |  4 +-
 kernel/trace/trace_functions_graph.c |  5 ++-
 kernel/trace/trace_selftest.c        |  2 +-
 7 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622..7581dbe456d 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 {
 	struct ring_buffer_event *e;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
 	if (e)
 		goto event;
 	if (ring_buffer_swap_cpu(op_ring_buffer_read,
 				 op_ring_buffer_write,
 				 cpu))
 		return NULL;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
 	if (e)
 		goto event;
 	return NULL;
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5fcc31ed577..c8297761e41 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		      unsigned long length, void *data);
 
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events);
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d1187ef20ca..8295650444c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
 
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS	(1 << 31)
+
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
 	local_t		 commit;	/* write committed index */
@@ -416,6 +419,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 			       (unsigned int)sizeof(field.commit),
 			       (unsigned int)is_signed_type(long));
 
+	ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			       (unsigned int)offsetof(typeof(field), commit),
+			       1,
+			       (unsigned int)is_signed_type(long));
+
 	ret = trace_seq_printf(s, "\tfield: char data;\t"
 			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			       (unsigned int)offsetof(typeof(field), data),
@@ -439,6 +448,8 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
+	unsigned long			lost_events;
+	unsigned long			last_overrun;
 	local_t				commit_overrun;
 	local_t				overrun;
 	local_t				entries;
@@ -2835,6 +2846,7 @@ static struct buffer_page *
 rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = NULL;
+	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
 	int ret;
@@ -2895,6 +2907,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	/* The reader page will be pointing to the new head */
 	rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
 
+	/*
+	 * We want to make sure we read the overruns after we set up our
+	 * pointers to the next object. The writer side does a
+	 * cmpxchg to cross pages which acts as the mb on the writer
+	 * side. Note, the reader will constantly fail the swap
+	 * while the writer is updating the pointers, so this
+	 * guarantees that the overwrite recorded here is the one we
+	 * want to compare with the last_overrun.
+	 */
+	smp_mb();
+	overwrite = local_read(&(cpu_buffer->overrun));
+
 	/*
 	 * Here's the tricky part.
 	 *
@@ -2926,6 +2950,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page = reader;
 	rb_reset_reader_page(cpu_buffer);
 
+	if (overwrite != cpu_buffer->last_overrun) {
+		cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+		cpu_buffer->last_overrun = overwrite;
+	}
+
 	goto again;
 
  out:
@@ -3002,8 +3031,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 		rb_advance_iter(iter);
 }
 
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return cpu_buffer->lost_events;
+}
+
 static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+	       unsigned long *lost_events)
 {
 	struct ring_buffer_event *event;
 	struct buffer_page *reader;
@@ -3055,6 +3090,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
 		}
+		if (lost_events)
+			*lost_events = rb_lost_events(cpu_buffer);
 		return event;
 
 	default:
@@ -3165,12 +3202,14 @@ static inline int rb_ok_to_lock(void)
  * @buffer: The ring buffer to read
  * @cpu: The cpu to peak at
  * @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
  *
  * This will return the event that will be read next, but does
  * not consume the data.
  */
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
@@ -3185,7 +3224,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	local_irq_save(flags);
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
-	event = rb_buffer_peek(cpu_buffer, ts);
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		rb_advance_reader(cpu_buffer);
 	if (dolock)
@@ -3227,13 +3266,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 /**
  * ring_buffer_consume - return an event and consume it
  * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
  *
  * Returns the next event in the ring buffer, and that event is consumed.
  * Meaning, that sequential reads will keep returning a different event,
  * and eventually empty the ring buffer if the producer is slower.
  */
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event = NULL;
@@ -3254,9 +3297,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
 
-	event = rb_buffer_peek(cpu_buffer, ts);
-	if (event)
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+	if (event) {
+		cpu_buffer->lost_events = 0;
 		rb_advance_reader(cpu_buffer);
+	}
 
 	if (dolock)
 		spin_unlock(&cpu_buffer->reader_lock);
@@ -3405,6 +3450,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
 
+	cpu_buffer->lost_events = 0;
+	cpu_buffer->last_overrun = 0;
+
 	rb_head_page_activate(cpu_buffer);
 }
 
@@ -3684,6 +3732,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
+	int missed_events = 0;
 	int ret = -1;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -3716,6 +3765,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	read = reader->read;
 	commit = rb_page_commit(reader);
 
+	/* Check if any events were dropped */
+	if (cpu_buffer->lost_events)
+		missed_events = 1;
+
 	/*
 	 * If this page has been partially read or
 	 * if len is not big enough to read the rest of the page or
@@ -3779,6 +3832,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	}
 	ret = read;
 
+	cpu_buffer->lost_events = 0;
+	/*
+	 * Set a flag in the commit field if we lost events
+	 */
+	if (missed_events)
+		local_add(RB_MISSED_EVENTS, &bpage->commit);
+
  out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c798225..dc56556b55a 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
 	int *entry;
 	u64 ts;
 
-	event = ring_buffer_consume(buffer, cpu, &ts);
+	event = ring_buffer_consume(buffer, cpu, &ts, NULL);
 	if (!event)
 		return EVENT_DROPPED;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ec2ee6f656..fabb0033a9b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1556,7 +1556,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 	if (buf_iter)
 		event = ring_buffer_iter_peek(buf_iter, ts);
 	else
-		event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL);
 
 	ftrace_enable_cpu();
 
@@ -1635,7 +1635,7 @@ static void trace_consume(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
 	ftrace_disable_cpu();
-	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL);
 	ftrace_enable_cpu();
 }
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e6989d9b44d..a7f75fb10aa 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -489,9 +489,10 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * We need to consume the current entry to see
 			 * the next one.
 			 */
-			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+			ring_buffer_consume(iter->tr->buffer, iter->cpu,
+					    NULL, NULL);
 			event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
-						 NULL);
+						 NULL, NULL);
 		}
 
 		if (!event)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d6..e50180874c6 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
 	struct trace_entry *entry;
 	unsigned int loops = 0;
 
-	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
 		entry = ring_buffer_event_data(event);
 
 		/*
-- 
cgit v1.2.3


From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 19:49:26 -0400
Subject: tracing: Show the lost events in the trace_pipe output

Now that the ring buffer can keep track of where events are lost.
Use this information to the output of trace_pipe:

       hackbench-3588  [001]  1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock
       hackbench-3588  [001]  1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock
       hackbench-3588  [001]  1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock
CPU:1 [LOST 673 EVENTS]
       hackbench-3588  [001]  1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738
       hackbench-3588  [001]  1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem
       hackbench-3588  [001]  1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem

Even works with the function graph tracer:

 2) ! 170.098 us  |                                            }
 2)   4.036 us    |                                            rcu_irq_exit();
 2)   3.657 us    |                                            idle_cpu();
 2) ! 190.301 us  |                                          }
CPU:2 [LOST 2196 EVENTS]
 2)   0.853 us    |                            } /* cancel_dirty_page */
 2)               |                            remove_from_page_cache() {
 2)   1.578 us    |                              _raw_spin_lock_irq();
 2)               |                              __remove_from_page_cache() {

Note, it does not work with the iterator "trace" file, since it requires
the use of consuming the page from the ring buffer to determine how many
events were lost, which the iterator does not do.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  1 +
 kernel/trace/trace.c         | 30 ++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c0f4b364c71..39e71b0a3bf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -58,6 +58,7 @@ struct trace_iterator {
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
 	struct trace_entry	*ent;
+	unsigned long		lost_events;
 	int			leftover;
 	int			cpu;
 	u64			ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fabb0033a9b..0498bebcbfd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1545,7 +1545,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
 }
 
 static struct trace_entry *
-peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+		unsigned long *lost_events)
 {
 	struct ring_buffer_event *event;
 	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1557,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 	if (buf_iter)
 		event = ring_buffer_iter_peek(buf_iter, ts);
 	else
-		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL);
+		event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+					 lost_events);
 
 	ftrace_enable_cpu();
 
@@ -1564,10 +1566,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 }
 
 static struct trace_entry *
-__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+		  unsigned long *missing_events, u64 *ent_ts)
 {
 	struct ring_buffer *buffer = iter->tr->buffer;
 	struct trace_entry *ent, *next = NULL;
+	unsigned long lost_events, next_lost = 0;
 	int cpu_file = iter->cpu_file;
 	u64 next_ts = 0, ts;
 	int next_cpu = -1;
@@ -1580,7 +1584,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 	if (cpu_file > TRACE_PIPE_ALL_CPU) {
 		if (ring_buffer_empty_cpu(buffer, cpu_file))
 			return NULL;
-		ent = peek_next_entry(iter, cpu_file, ent_ts);
+		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
 		if (ent_cpu)
 			*ent_cpu = cpu_file;
 
@@ -1592,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 		if (ring_buffer_empty_cpu(buffer, cpu))
 			continue;
 
-		ent = peek_next_entry(iter, cpu, &ts);
+		ent = peek_next_entry(iter, cpu, &ts, &lost_events);
 
 		/*
 		 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1605,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 			next = ent;
 			next_cpu = cpu;
 			next_ts = ts;
+			next_lost = lost_events;
 		}
 	}
 
@@ -1610,6 +1615,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 	if (ent_ts)
 		*ent_ts = next_ts;
 
+	if (missing_events)
+		*missing_events = next_lost;
+
 	return next;
 }
 
@@ -1617,13 +1625,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts)
 {
-	return __find_next_entry(iter, ent_cpu, ent_ts);
+	return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
 }
 
 /* Find the next real entry, and increment the iterator to the next entry */
 static void *find_next_entry_inc(struct trace_iterator *iter)
 {
-	iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+	iter->ent = __find_next_entry(iter, &iter->cpu,
+				      &iter->lost_events, &iter->ts);
 
 	if (iter->ent)
 		trace_iterator_increment(iter);
@@ -1635,7 +1644,8 @@ static void trace_consume(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
 	ftrace_disable_cpu();
-	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL);
+	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+			    &iter->lost_events);
 	ftrace_enable_cpu();
 }
 
@@ -2030,6 +2040,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
 
+	if (iter->lost_events)
+		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+				 iter->cpu, iter->lost_events);
+
 	if (iter->trace && iter->trace->print_line) {
 		ret = iter->trace->print_line(iter);
 		if (ret != TRACE_TYPE_UNHANDLED)
-- 
cgit v1.2.3


From ff0ff84a0767df48d728c36510365344a7e7d582 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 22:11:42 -0400
Subject: ring-buffer: Add lost event count to end of sub buffer

Currently, binary readers of the ring buffer only know where events were
lost, but not how many events were lost at that location.
This information is available, but it would require adding another
field to the sub buffer header to include it.

But when a event can not fit at the end of a sub buffer, it is written
to the next sub buffer. This means there is a good chance that the
buffer may have room to hold this counter. If it does, write
the counter at the end of the sub buffer and set another flag
in the data size field that states that this information exists.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8295650444c..dc6d563a6d2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -320,6 +320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 /* Flag when events were overwritten */
 #define RB_MISSED_EVENTS	(1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED	(1 << 30)
 
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
@@ -340,6 +342,7 @@ struct buffer_page {
 	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
 	local_t		 entries;	/* entries on this page */
+	unsigned long	 real_end;	/* real end of data */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -1769,6 +1772,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 
+	/*
+	 * Save the original length to the meta data.
+	 * This will be used by the reader to add lost event
+	 * counter.
+	 */
+	tail_page->real_end = tail;
+
 	/*
 	 * If this event is bigger than the minimum size, then
 	 * we need to be careful that we don't subtract the
@@ -2888,6 +2898,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->reader_page->write, 0);
 	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
+	cpu_buffer->reader_page->real_end = 0;
 
  spin:
 	/*
@@ -3728,11 +3739,11 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
 	struct buffer_page *reader;
+	unsigned long missed_events;
 	unsigned long flags;
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
-	int missed_events = 0;
 	int ret = -1;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -3766,8 +3777,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	commit = rb_page_commit(reader);
 
 	/* Check if any events were dropped */
-	if (cpu_buffer->lost_events)
-		missed_events = 1;
+	missed_events = cpu_buffer->lost_events;
 
 	/*
 	 * If this page has been partially read or
@@ -3829,6 +3839,14 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
+
+		/*
+		 * Use the real_end for the data size,
+		 * This gives us a chance to store the lost events
+		 * on the page.
+		 */
+		if (reader->real_end)
+			local_set(&bpage->commit, reader->real_end);
 	}
 	ret = read;
 
@@ -3836,8 +3854,19 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	/*
 	 * Set a flag in the commit field if we lost events
 	 */
-	if (missed_events)
+	if (missed_events) {
+		commit = local_read(&bpage->commit);
+
+		/* If there is room at the end of the page to save the
+		 * missed events, then record it there.
+		 */
+		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+			memcpy(&bpage->data[commit], &missed_events,
+			       sizeof(missed_events));
+			local_add(RB_MISSED_STORED, &bpage->commit);
+		}
 		local_add(RB_MISSED_EVENTS, &bpage->commit);
+	}
 
  out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-- 
cgit v1.2.3


From cb6e943ccf19ab6d3189147e9d625a992e016084 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 1 Apr 2010 03:17:25 +0200
Subject: oprofile: remove double ring buffering

oprofile used a double buffer scheme for its cpu event buffer
to avoid races on reading with the old locked ring buffer.

But that is obsolete now with the new ring buffer, so simply
use a single buffer. This greatly simplifies the code and avoids
a lot of sample drops on large runs, especially with call graph.

Based on suggestions from Steven Rostedt

For stable kernels from v2.6.32, but not earlier.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/cpu_buffer.c | 63 +++++++++----------------------------------
 1 file changed, 13 insertions(+), 50 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622..de82183bb9b 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -30,23 +30,7 @@
 
 #define OP_BUFFER_FLAGS	0
 
-/*
- * Read and write access is using spin locking. Thus, writing to the
- * buffer by NMI handler (x86) could occur also during critical
- * sections when reading the buffer. To avoid this, there are 2
- * buffers for independent read and write access. Read access is in
- * process context only, write access only in the NMI handler. If the
- * read buffer runs empty, both buffers are swapped atomically. There
- * is potentially a small window during swapping where the buffers are
- * disabled and samples could be lost.
- *
- * Using 2 buffers is a little bit overhead, but the solution is clear
- * and does not require changes in the ring buffer implementation. It
- * can be changed to a single buffer solution when the ring buffer
- * access is implemented as non-locking atomic code.
- */
-static struct ring_buffer *op_ring_buffer_read;
-static struct ring_buffer *op_ring_buffer_write;
+static struct ring_buffer *op_ring_buffer;
 DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);
 
 static void wq_sync_buffer(struct work_struct *work);
@@ -68,12 +52,9 @@ void oprofile_cpu_buffer_inc_smpl_lost(void)
 
 void free_cpu_buffers(void)
 {
-	if (op_ring_buffer_read)
-		ring_buffer_free(op_ring_buffer_read);
-	op_ring_buffer_read = NULL;
-	if (op_ring_buffer_write)
-		ring_buffer_free(op_ring_buffer_write);
-	op_ring_buffer_write = NULL;
+	if (op_ring_buffer)
+		ring_buffer_free(op_ring_buffer);
+	op_ring_buffer = NULL;
 }
 
 #define RB_EVENT_HDR_SIZE 4
@@ -86,11 +67,8 @@ int alloc_cpu_buffers(void)
 	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
 						 RB_EVENT_HDR_SIZE);
 
-	op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
-	if (!op_ring_buffer_read)
-		goto fail;
-	op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
-	if (!op_ring_buffer_write)
+	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
+	if (!op_ring_buffer)
 		goto fail;
 
 	for_each_possible_cpu(i) {
@@ -162,16 +140,11 @@ struct op_sample
 *op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
 {
 	entry->event = ring_buffer_lock_reserve
-		(op_ring_buffer_write, sizeof(struct op_sample) +
+		(op_ring_buffer, sizeof(struct op_sample) +
 		 size * sizeof(entry->sample->data[0]));
-	if (entry->event)
-		entry->sample = ring_buffer_event_data(entry->event);
-	else
-		entry->sample = NULL;
-
-	if (!entry->sample)
+	if (!entry->event)
 		return NULL;
-
+	entry->sample = ring_buffer_event_data(entry->event);
 	entry->size = size;
 	entry->data = entry->sample->data;
 
@@ -180,25 +153,16 @@ struct op_sample
 
 int op_cpu_buffer_write_commit(struct op_entry *entry)
 {
-	return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event);
+	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
 }
 
 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 {
 	struct ring_buffer_event *e;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
-	if (e)
-		goto event;
-	if (ring_buffer_swap_cpu(op_ring_buffer_read,
-				 op_ring_buffer_write,
-				 cpu))
+	e = ring_buffer_consume(op_ring_buffer, cpu, NULL);
+	if (!e)
 		return NULL;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
-	if (e)
-		goto event;
-	return NULL;
 
-event:
 	entry->event = e;
 	entry->sample = ring_buffer_event_data(e);
 	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
@@ -209,8 +173,7 @@ event:
 
 unsigned long op_cpu_buffer_entries(int cpu)
 {
-	return ring_buffer_entries_cpu(op_ring_buffer_read, cpu)
-		+ ring_buffer_entries_cpu(op_ring_buffer_write, cpu);
+	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
 }
 
 static int
-- 
cgit v1.2.3


From 9414e99672271adcc661f3c160a30b374179b92f Mon Sep 17 00:00:00 2001
From: Phil Carmody <ext-phil.2.carmody@nokia.com>
Date: Wed, 28 Apr 2010 12:09:16 -0500
Subject: oprofile: protect from not being in an IRQ context

http://lkml.org/lkml/2010/4/27/285

Protect against dereferencing regs when it's NULL, and
force a magic number into pc to prevent too deep processing.
This approach permits the dropped samples to be tallied as
invalid Instruction Pointer events.

e.g. output from about 15mins at 10kHz sample rate:
Nr. samples received: 2565380
Nr. samples lost invalid pc: 4

Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/cpu_buffer.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 0ac8b065ee0..219f79e2210 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -319,8 +319,16 @@ void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
 
 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
 {
-	int is_kernel = !user_mode(regs);
-	unsigned long pc = profile_pc(regs);
+	int is_kernel;
+	unsigned long pc;
+
+	if (likely(regs)) {
+		is_kernel = !user_mode(regs);
+		pc = profile_pc(regs);
+	} else {
+		is_kernel = 0;    /* This value will not be used */
+		pc = ESCAPE_CODE; /* as this causes an early return. */
+	}
 
 	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
 }
-- 
cgit v1.2.3


From 81c4a8a6733ad2ff49c0e077b51403367601b3e7 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 22 Apr 2010 19:14:49 +0200
Subject: oprofile: update file list in MAINTAINERS file

File list now catches:

 $ xargs | eval ls -d $(cat) | sort -u
 arch/*/include/asm/oprofile*.h
 arch/*/oprofile/
 drivers/oprofile/
 include/linux/oprofile.h

 arch/alpha/oprofile/
 arch/arm/oprofile/
 arch/avr32/oprofile/
 arch/blackfin/oprofile/
 arch/ia64/oprofile/
 arch/m32r/oprofile/
 arch/microblaze/oprofile/
 arch/mips/oprofile/
 arch/mn10300/oprofile/
 arch/parisc/oprofile/
 arch/powerpc/include/asm/oprofile_impl.h
 arch/powerpc/oprofile/
 arch/s390/oprofile/
 arch/sh/oprofile/
 arch/sparc/oprofile/
 arch/x86/oprofile/
 drivers/oprofile/
 include/linux/oprofile.h

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a0e3c3a47a5..31e858617d0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4165,6 +4165,7 @@ OPROFILE
 M:	Robert Richter <robert.richter@amd.com>
 L:	oprofile-list@lists.sf.net
 S:	Maintained
+F:	arch/*/include/asm/oprofile*.h
 F:	arch/*/oprofile/
 F:	drivers/oprofile/
 F:	include/linux/oprofile.h
-- 
cgit v1.2.3


From 8f5a2dd83a1f8e89fdc17eb0f2f07c2e713e635a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 19:09:51 +0100
Subject: oprofile/x86: rework error handler in nmi_setup()

This patch improves the error handler in nmi_setup(). Most parts of
the code are moved to allocate_msrs(). In case of an error
allocate_msrs() also frees already allocated memory. nmi_setup()
becomes easier and better extendable.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee7101..c0c21f200fa 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -295,6 +295,7 @@ static void free_msrs(void)
 		kfree(per_cpu(cpu_msrs, i).controls);
 		per_cpu(cpu_msrs, i).controls = NULL;
 	}
+	nmi_shutdown_mux();
 }
 
 static int allocate_msrs(void)
@@ -307,14 +308,21 @@ static int allocate_msrs(void)
 		per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).counters)
-			return 0;
+			goto fail;
 		per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).controls)
-			return 0;
+			goto fail;
 	}
 
+	if (!nmi_setup_mux())
+		goto fail;
+
 	return 1;
+
+fail:
+	free_msrs();
+	return 0;
 }
 
 static void nmi_cpu_setup(void *dummy)
@@ -342,17 +350,7 @@ static int nmi_setup(void)
 	int cpu;
 
 	if (!allocate_msrs())
-		err = -ENOMEM;
-	else if (!nmi_setup_mux())
-		err = -ENOMEM;
-	else
-		err = register_die_notifier(&profile_exceptions_nb);
-
-	if (err) {
-		free_msrs();
-		nmi_shutdown_mux();
-		return err;
-	}
+		return -ENOMEM;
 
 	/* We need to serialize save and setup for HT because the subset
 	 * of msrs are distinct for save and setup operations
@@ -374,9 +372,17 @@ static int nmi_setup(void)
 
 		mux_clone(cpu);
 	}
+
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	return 0;
+fail:
+	free_msrs();
+	return err;
 }
 
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -421,7 +427,6 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	unregister_die_notifier(&profile_exceptions_nb);
-	nmi_shutdown_mux();
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
 	free_msrs();
-- 
cgit v1.2.3


From d0e4120fda6f87eead438eed4d49032e12060e58 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 19:33:21 +0100
Subject: oprofile/x86: reserve counter msrs pairwise

For AMD's and Intel's P6 generic performance counters have pairwise
counter and control msrs. This patch changes the counter reservation
in a way that both msrs must be registered. It joins some counter
loops and also removes the unnecessary NUM_CONTROLS macro in the AMD
implementation.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 43 +++++++++++++++++----------------------
 arch/x86/oprofile/op_model_ppro.c | 36 ++++++++++++++++----------------
 2 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7db..2e2bc902b86 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
 #include "op_counter.h"
 
 #define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
 #define NUM_VIRT_COUNTERS 32
-#define NUM_VIRT_CONTROLS 32
 #else
 #define NUM_VIRT_COUNTERS NUM_COUNTERS
-#define NUM_VIRT_CONTROLS NUM_CONTROLS
 #endif
 
 #define OP_EVENT_MASK			0x0FFF
@@ -134,13 +131,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < NUM_CONTROLS; i++) {
-		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
 	}
 }
 
@@ -160,7 +159,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 	}
 
 	/* clear all counters */
-	for (i = 0; i < NUM_CONTROLS; ++i) {
+	for (i = 0; i < NUM_COUNTERS; ++i) {
 		if (unlikely(!msrs->controls[i].addr)) {
 			if (counter_config[i].enabled && !smp_processor_id())
 				/*
@@ -175,12 +174,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -430,12 +427,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-	}
-	for (i = 0; i < NUM_CONTROLS; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
 	}
 }
 
@@ -583,7 +578,7 @@ static void op_amd_exit(void)
 
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
-	.num_controls		= NUM_CONTROLS,
+	.num_controls		= NUM_COUNTERS,
 	.num_virt_counters	= NUM_VIRT_COUNTERS,
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b..f8e268e8e99 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -35,13 +35,15 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
-		if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < num_counters; i++) {
-		if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
 	}
 }
 
@@ -92,12 +94,10 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI *
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -194,12 +194,10 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-	}
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
 	}
 	if (reset_value) {
 		kfree(reset_value);
-- 
cgit v1.2.3


From 83300ce0df6b72e156b386457aa0f0902b8c0a98 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 20:01:54 +0100
Subject: oprofile/x86: moving shutdown functions

Moving some code in preparation of the next patch.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 24 ++++++++++++------------
 arch/x86/oprofile/op_model_p4.c   | 38 ++++++++++++++++++--------------------
 arch/x86/oprofile/op_model_ppro.c | 33 ++++++++++++++++-----------------
 3 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 2e2bc902b86..7e5886d54bd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -126,6 +126,18 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
 
 /* functions for op_amd_spec */
 
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
 static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -422,18 +434,6 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
 static u8 ibs_eilvt_off;
 
 static inline void apic_init_ibs_nmi_per_cpu(void *arg)
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684..7cc80df330d 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,6 +385,24 @@ static unsigned int get_stagger(void)
 
 static unsigned long reset_value[NUM_COUNTERS_NON_HT];
 
+static void p4_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (msrs->counters[i].addr)
+			release_perfctr_nmi(msrs->counters[i].addr);
+	}
+	/*
+	 * some of the control registers are specially reserved in
+	 * conjunction with the counter registers (hence the starting offset).
+	 * This saves a few bits.
+	 */
+	for (i = num_counters; i < num_controls; ++i) {
+		if (msrs->controls[i].addr)
+			release_evntsel_nmi(msrs->controls[i].addr);
+	}
+}
 
 static void p4_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -668,26 +686,6 @@ static void p4_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(msrs->counters[i].addr);
-	}
-	/*
-	 * some of the control registers are specially reserved in
-	 * conjunction with the counter registers (hence the starting offset).
-	 * This saves a few bits.
-	 */
-	for (i = num_counters; i < num_controls; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(msrs->controls[i].addr);
-	}
-}
-
-
 #ifdef CONFIG_SMP
 struct op_x86_model_spec op_p4_ht2_spec = {
 	.num_counters		= NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index f8e268e8e99..b07d25a52f0 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,6 +30,22 @@ static int counter_width = 32;
 
 static u64 *reset_value;
 
+static void ppro_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+	}
+	if (reset_value) {
+		kfree(reset_value);
+		reset_value = NULL;
+	}
+}
+
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -189,23 +205,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
-	}
-	if (reset_value) {
-		kfree(reset_value);
-		reset_value = NULL;
-	}
-}
-
-
 struct op_x86_model_spec op_ppro_spec = {
 	.num_counters		= 2,
 	.num_controls		= 2,
-- 
cgit v1.2.3


From 8617f98c001d00b176422d707e6a67b88bcd7e0d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 26 Feb 2010 17:20:55 +0100
Subject: oprofile/x86: return -EBUSY if counters are already reserved

In case a counter is already reserved by the watchdog or perf_event
subsystem, oprofile ignored this counters silently. This case is
handled now and oprofile_setup() now reports an error.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c       |  5 ++++-
 arch/x86/oprofile/op_model_amd.c  | 24 +++++++++++++-----------
 arch/x86/oprofile/op_model_p4.c   | 14 +++++++++++++-
 arch/x86/oprofile/op_model_ppro.c | 24 +++++++++++++-----------
 arch/x86/oprofile/op_x86_model.h  |  2 +-
 5 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c0c21f200fa..9f001d90459 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -357,7 +357,10 @@ static int nmi_setup(void)
 	 */
 
 	/* Assume saved/restored counters are the same on all CPUs */
-	model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
 	for_each_possible_cpu(cpu) {
 		if (!cpu)
 			continue;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 7e5886d54bd..536d0b0b39a 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -138,21 +138,30 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
 		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -172,15 +181,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 7cc80df330d..182558dd551 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -404,7 +404,7 @@ static void p4_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_fill_in_addresses(struct op_msrs * const msrs)
+static int p4_fill_in_addresses(struct op_msrs * const msrs)
 {
 	unsigned int i;
 	unsigned int addr, cccraddr, stag;
@@ -486,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
 			msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
 		}
 	}
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!counter_config[i].enabled)
+			continue;
+		if (msrs->controls[i].addr)
+			continue;
+		op_x86_warn_reserved(i);
+		p4_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
 }
 
 
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index b07d25a52f0..1fd17cfb956 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -46,21 +46,30 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_fill_in_addresses(struct op_msrs * const msrs)
+static int ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
 		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		ppro_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 
@@ -96,15 +105,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd..551401398fb 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -41,7 +41,7 @@ struct op_x86_model_spec {
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
 	void		(*exit)(void);
-	void		(*fill_in_addresses)(struct op_msrs * const msrs);
+	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
 	int		(*check_ctrs)(struct pt_regs * const regs,
-- 
cgit v1.2.3


From da759fe5be24ec3b236a76c007b460cf6caf2009 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 26 Feb 2010 10:54:56 +0100
Subject: oprofile/x86: move IBS code

Moving code to make future changes easier. This groups all IBS code
together.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 220 +++++++++++++++++++--------------------
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 536d0b0b39a..e159254fb7c 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -102,116 +102,6 @@ static u32 get_ibs_caps(void)
 	return ibs_caps;
 }
 
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
-			       struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			goto fail;
-		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			goto fail;
-		}
-		/* both registers must be reserved */
-		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
-		continue;
-	fail:
-		if (!counter_config[i].enabled)
-			continue;
-		op_x86_warn_reserved(i);
-		op_amd_shutdown(msrs);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
-			      struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* setup reset_value */
-	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
-		if (counter_config[i].enabled
-		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
-			reset_value[i] = counter_config[i].count;
-		else
-			reset_value[i] = 0;
-	}
-
-	/* clear all counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->controls[i].addr)
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
-			op_x86_warn_in_use(i);
-		val &= model->reserved;
-		wrmsrl(msrs->controls[i].addr, val);
-		/*
-		 * avoid a false detection of ctr overflows in NMI
-		 * handler
-		 */
-		wrmsrl(msrs->counters[i].addr, -1LL);
-	}
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-
-		/* setup counter registers */
-		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
-		/* setup control registers */
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
 /*
  * 16-bit Linear Feedback Shift Register (LFSR)
  *
@@ -376,6 +266,116 @@ static void op_amd_stop_ibs(void)
 		wrmsrl(MSR_AMD64_IBSOPCTL, 0);
 }
 
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+			       struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; i++) {
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			goto fail;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			goto fail;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+			      struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* setup reset_value */
+	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+		if (counter_config[i].enabled
+		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
+			reset_value[i] = counter_config[i].count;
+		else
+			reset_value[i] = 0;
+	}
+
+	/* clear all counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->controls[i].addr)
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+			op_x86_warn_in_use(i);
+		val &= model->reserved;
+		wrmsrl(msrs->controls[i].addr, val);
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
+		wrmsrl(msrs->counters[i].addr, -1LL);
+	}
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+
+		/* setup counter registers */
+		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+		/* setup control registers */
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
 static int op_amd_check_ctrs(struct pt_regs * const regs,
 			     struct op_msrs const * const msrs)
 {
-- 
cgit v1.2.3


From 5bdb7934ca4115a12c7d585c5a45312b1c36909b Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 31 Mar 2010 11:58:36 +0200
Subject: oprofile/x86: remove duplicate IBS capability check

The check is already done in ibs_exit().

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index e159254fb7c..384c5241048 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -490,8 +490,7 @@ static int init_ibs_nmi(void)
 /* uninitialize the APIC for the IBS interrupts if needed */
 static void clear_ibs_nmi(void)
 {
-	if (ibs_caps)
-		on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
 }
 
 /* initialize the APIC for the IBS interrupts if available */
-- 
cgit v1.2.3


From 2623a1d55a6260c855e1f6d1895900b50b40a896 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 19:44:32 +0200
Subject: oprofile/x86: fix uninitialized counter usage during cpu hotplug

This fixes a NULL pointer dereference that is triggered when taking a
cpu offline after oprofile was initialized, e.g.:

 $ opcontrol --init
 $ opcontrol --start-daemon
 $ opcontrol --shutdown
 $ opcontrol --deinit
 $ echo 0 > /sys/devices/system/cpu/cpu1/online

See the crash dump below. Though the counter has been disabled the cpu
notifier is still active and trying to use already freed counter data.

This fix is for linux-stable. To proper fix this, the hotplug code
must be rewritten. Thus I will leave a WARN_ON_ONCE() message with
this patch.

BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
PGD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu1/online
CPU 1
Modules linked in:

Pid: 0, comm: swapper Not tainted 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Anaheim/Anaheim
RIP: 0010:[<ffffffff8132ad57>]  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
RSP: 0018:ffff880001843f28  EFLAGS: 00010006
RAX: 0000000000000000 RBX: 0000000000000000 RCX: dead000000200200
RDX: ffff880001843f68 RSI: dead000000100100 RDI: 0000000000000000
RBP: ffff880001843f48 R08: 0000000000000000 R09: ffff880001843f08
R10: ffffffff8102c9a5 R11: ffff88000184ea80 R12: 0000000000000000
R13: ffff88000184f6c0 R14: 0000000000000000 R15: 0000000000000000
FS:  00007fec6a92e6f0(0000) GS:ffff880001840000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 000000000163b000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff88042fcd8000, task ffff88042fcd51d0)
Stack:
 ffff880001843f48 0000000000000001 ffff88042e9f7d38 ffff880001843f68
<0> ffff880001843f58 ffffffff8132a602 ffff880001843f98 ffffffff810521b3
<0> ffff880001843f68 ffff880001843f68 ffff880001843f88 ffff88042fcd9fd8
Call Trace:
 <IRQ>
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
Code: 89 e5 41 55 49 89 fd 41 54 45 31 e4 53 31 db 48 83 ec 08 89 df e8 be f8 ff ff 48 98 48 83 3c c5 10 67 7a 81 00 74 1f 49 8b 45 08 <42> 8b 0c 20 0f 32 48 c1 e2 20 25 ff ff bf ff 48 09 d0 48 89 c2
RIP  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
 RSP <ffff880001843f28>
CR2: 0000000000000000
---[ end trace 679ac372d674b757 ]---
Kernel panic - not syncing: Fatal exception in interrupt
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff813ebd6a>] panic+0x9e/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] oops_end+0x81/0x8e
 [<ffffffff8101efee>] no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
------------[ cut here ]------------
WARNING: at /local/rrichter/.source/linux/arch/x86/kernel/smp.c:118 native_smp_send_reschedule+0x27/0x53()
Hardware name: Anaheim
Modules linked in:
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff81017f32>] ? native_smp_send_reschedule+0x27/0x53
 [<ffffffff81030ee2>] warn_slowpath_common+0x77/0xa4
 [<ffffffff81030f1e>] warn_slowpath_null+0xf/0x11
 [<ffffffff81017f32>] native_smp_send_reschedule+0x27/0x53
 [<ffffffff8102634b>] resched_task+0x60/0x62
 [<ffffffff8102653a>] check_preempt_curr_idle+0x10/0x12
 [<ffffffff8102c8ea>] try_to_wake_up+0x1f5/0x284
 [<ffffffff8102c986>] default_wake_function+0xd/0xf
 [<ffffffff810a110d>] pollwake+0x57/0x5a
 [<ffffffff8102c979>] ? default_wake_function+0x0/0xf
 [<ffffffff81026be5>] __wake_up_common+0x46/0x75
 [<ffffffff81026ed0>] __wake_up+0x38/0x50
 [<ffffffff81031694>] printk_tick+0x39/0x3b
 [<ffffffff8103ac37>] update_process_times+0x3f/0x5c
 [<ffffffff8104dc63>] tick_periodic+0x5d/0x69
 [<ffffffff8104dc90>] tick_handle_periodic+0x21/0x71
 [<ffffffff81018fd0>] smp_apic_timer_interrupt+0x82/0x95
 [<ffffffff81002853>] apic_timer_interrupt+0x13/0x20
 [<ffffffff81030cb5>] ? panic_blink_one_second+0x0/0x7b
 [<ffffffff813ebdd6>] ? panic+0x10a/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] ? oops_end+0x81/0x8e
 [<ffffffff8101efee>] ? no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] ? __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] ? bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] ? do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] ? page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] ? nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] ? generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] ? smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] ? call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] ? c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] ? cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] ? start_secondary+0x1ae/0x1b2
---[ end trace 679ac372d674b758 ]---

Cc: Andi Kleen <andi@firstfloor.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 9f001d90459..24582040b71 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -95,7 +95,10 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
 static void nmi_cpu_start(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->start(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->start(msrs);
 }
 
 static int nmi_start(void)
@@ -107,7 +110,10 @@ static int nmi_start(void)
 static void nmi_cpu_stop(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->stop(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->stop(msrs);
 }
 
 static void nmi_stop(void)
-- 
cgit v1.2.3


From 216f3d9b4e5121feea4b13fae9d4c83e8d7e1c8a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 11:58:46 +0200
Subject: oprofile/x86: remove CONFIG_SMP macros

CPU notifier register functions also exist if CONFIG_SMP is
disabled. This change is part of hotplug code rework and also
necessary for later patches.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 24582040b71..c5df8ee76ee 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -471,7 +471,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
 	return 0;
 }
 
-#ifdef CONFIG_SMP
 static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 				 void *data)
 {
@@ -491,7 +490,6 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
-#endif
 
 #ifdef CONFIG_PM
 
@@ -701,9 +699,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-#ifdef CONFIG_SMP
 	register_cpu_notifier(&oprofile_cpu_nb);
-#endif
+
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -732,9 +729,7 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
-#ifdef CONFIG_SMP
 		unregister_cpu_notifier(&oprofile_cpu_nb);
-#endif
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3


From 6ae56b55bc364bc2f2342f599b46581627ba22da Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 29 Apr 2010 14:55:55 +0200
Subject: oprofile/x86: protect cpu hotplug sections

This patch reworks oprofile cpu hotplug code as follows:

Introduce ctr_running variable to check, if counters are running or
not. The state must be known for taking a cpu on or offline and when
switching counters during counter multiplexing.

Protect on_each_cpu() sections with get_online_cpus()/put_online_cpu()
functions. This is necessary if notifiers or states are
modified. Within these sections the cpu mask may not change.

Switch only between counters in nmi_cpu_switch(), if counters are
running. Otherwise the switch may restart a counter though they are
disabled.

Add nmi_cpu_setup() and nmi_cpu_shutdown() to cpu hotplug code. The
function must also be called to avoid uninitialzed counter usage.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c5df8ee76ee..b56601eaf29 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
 static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
 static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
 
-/* 0 == registered but off, 1 == registered and on */
-static int nmi_enabled = 0;
+/* must be protected with get_online_cpus()/put_online_cpus(): */
+static int nmi_enabled;
+static int ctr_running;
 
 struct op_counter_config counter_config[OP_MAX_COUNTER];
 
@@ -103,7 +104,10 @@ static void nmi_cpu_start(void *dummy)
 
 static int nmi_start(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_start, NULL, 1);
+	ctr_running = 1;
+	put_online_cpus();
 	return 0;
 }
 
@@ -118,7 +122,10 @@ static void nmi_cpu_stop(void *dummy)
 
 static void nmi_stop(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_stop, NULL, 1);
+	ctr_running = 0;
+	put_online_cpus();
 }
 
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -258,7 +265,10 @@ static int nmi_switch_event(void)
 	if (nmi_multiplex_on() < 0)
 		return -EINVAL;		/* not necessary */
 
-	on_each_cpu(nmi_cpu_switch, NULL, 1);
+	get_online_cpus();
+	if (ctr_running)
+		on_each_cpu(nmi_cpu_switch, NULL, 1);
+	put_online_cpus();
 
 	return 0;
 }
@@ -386,8 +396,11 @@ static int nmi_setup(void)
 	if (err)
 		goto fail;
 
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
+	put_online_cpus();
+
 	return 0;
 fail:
 	free_msrs();
@@ -433,8 +446,11 @@ static void nmi_shutdown(void)
 {
 	struct op_msrs *msrs;
 
-	nmi_enabled = 0;
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
@@ -442,6 +458,22 @@ static void nmi_shutdown(void)
 	put_cpu_var(cpu_msrs);
 }
 
+static void nmi_cpu_up(void *dummy)
+{
+	if (nmi_enabled)
+		nmi_cpu_setup(dummy);
+	if (ctr_running)
+		nmi_cpu_start(dummy);
+}
+
+static void nmi_cpu_down(void *dummy)
+{
+	if (ctr_running)
+		nmi_cpu_stop(dummy);
+	if (nmi_enabled)
+		nmi_cpu_shutdown(dummy);
+}
+
 static int nmi_create_files(struct super_block *sb, struct dentry *root)
 {
 	unsigned int i;
@@ -478,10 +510,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 	switch (action) {
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+		smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+		smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
 		break;
 	}
 	return NOTIFY_DONE;
@@ -699,7 +731,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
+	get_online_cpus();
 	register_cpu_notifier(&oprofile_cpu_nb);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
@@ -729,7 +765,11 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
+		get_online_cpus();
 		unregister_cpu_notifier(&oprofile_cpu_nb);
+		nmi_enabled = 0;
+		ctr_running = 0;
+		put_online_cpus();
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3


From de654649737696ecf32873c341b305e30f3dc777 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 14:41:22 +0200
Subject: oprofile/x86: stop disabled counters in nmi handler

This patch adds checks to the nmi handler. Now samples are only
generated and counters reenabled, if the counters are running.
Otherwise the counters are stopped, if oprofile is using the nmi. In
other cases it will ignore the nmi notification.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b56601eaf29..94b5481bb6c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -62,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
 {
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
-	int cpu = smp_processor_id();
 
 	switch (val) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		if (ctr_running)
+			model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
+		else if (!nmi_enabled)
+			break;
+		else
+			model->stop(&__get_cpu_var(cpu_msrs));
 		ret = NOTIFY_STOP;
 		break;
 	default:
@@ -392,6 +396,9 @@ static int nmi_setup(void)
 		mux_clone(cpu);
 	}
 
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
 	err = register_die_notifier(&profile_exceptions_nb);
 	if (err)
 		goto fail;
@@ -451,6 +458,7 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	ctr_running = 0;
 	put_online_cpus();
+	barrier();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
-- 
cgit v1.2.3


From d30d64c6da3ec7a0708bfffa7e05752d5b9a1093 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 15:52:26 +0200
Subject: oprofile/x86: reordering some functions

Reordering some functions. Necessary for the next patch. No functional
changes.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 134 ++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 94b5481bb6c..7de0572b0a5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -364,56 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
 	.priority = 2
 };
 
-static int nmi_setup(void)
-{
-	int err = 0;
-	int cpu;
-
-	if (!allocate_msrs())
-		return -ENOMEM;
-
-	/* We need to serialize save and setup for HT because the subset
-	 * of msrs are distinct for save and setup operations
-	 */
-
-	/* Assume saved/restored counters are the same on all CPUs */
-	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
-	if (err)
-		goto fail;
-
-	for_each_possible_cpu(cpu) {
-		if (!cpu)
-			continue;
-
-		memcpy(per_cpu(cpu_msrs, cpu).counters,
-		       per_cpu(cpu_msrs, 0).counters,
-		       sizeof(struct op_msr) * model->num_counters);
-
-		memcpy(per_cpu(cpu_msrs, cpu).controls,
-		       per_cpu(cpu_msrs, 0).controls,
-		       sizeof(struct op_msr) * model->num_controls);
-
-		mux_clone(cpu);
-	}
-
-	nmi_enabled = 0;
-	ctr_running = 0;
-	barrier();
-	err = register_die_notifier(&profile_exceptions_nb);
-	if (err)
-		goto fail;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_setup, NULL, 1);
-	nmi_enabled = 1;
-	put_online_cpus();
-
-	return 0;
-fail:
-	free_msrs();
-	return err;
-}
-
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
 {
 	struct op_msr *counters = msrs->counters;
@@ -449,23 +399,6 @@ static void nmi_cpu_shutdown(void *dummy)
 	nmi_cpu_restore_registers(msrs);
 }
 
-static void nmi_shutdown(void)
-{
-	struct op_msrs *msrs;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-	barrier();
-	unregister_die_notifier(&profile_exceptions_nb);
-	msrs = &get_cpu_var(cpu_msrs);
-	model->shutdown(msrs);
-	free_msrs();
-	put_cpu_var(cpu_msrs);
-}
-
 static void nmi_cpu_up(void *dummy)
 {
 	if (nmi_enabled)
@@ -531,6 +464,73 @@ static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
 
+static int nmi_setup(void)
+{
+	int err = 0;
+	int cpu;
+
+	if (!allocate_msrs())
+		return -ENOMEM;
+
+	/* We need to serialize save and setup for HT because the subset
+	 * of msrs are distinct for save and setup operations
+	 */
+
+	/* Assume saved/restored counters are the same on all CPUs */
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
+	for_each_possible_cpu(cpu) {
+		if (!cpu)
+			continue;
+
+		memcpy(per_cpu(cpu_msrs, cpu).counters,
+		       per_cpu(cpu_msrs, 0).counters,
+		       sizeof(struct op_msr) * model->num_counters);
+
+		memcpy(per_cpu(cpu_msrs, cpu).controls,
+		       per_cpu(cpu_msrs, 0).controls,
+		       sizeof(struct op_msr) * model->num_controls);
+
+		mux_clone(cpu);
+	}
+
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_setup, NULL, 1);
+	nmi_enabled = 1;
+	put_online_cpus();
+
+	return 0;
+fail:
+	free_msrs();
+	return err;
+}
+
+static void nmi_shutdown(void)
+{
+	struct op_msrs *msrs;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
+	barrier();
+	unregister_die_notifier(&profile_exceptions_nb);
+	msrs = &get_cpu_var(cpu_msrs);
+	model->shutdown(msrs);
+	free_msrs();
+	put_cpu_var(cpu_msrs);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_suspend(struct sys_device *dev, pm_message_t state)
-- 
cgit v1.2.3


From 3de668ee8d5b1e08da3200f926ff5a28aeb99bc2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 15:00:25 +0200
Subject: oprofile/x86: notify cpus only when daemon is running

This patch moves the cpu notifier registration from nmi_init() to
nmi_setup(). The corresponding unregistration function is now in
nmi_shutdown(). Thus, the hotplug code is only active, if the oprofile
daemon is running.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7de0572b0a5..2a086726cad 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -504,6 +504,7 @@ static int nmi_setup(void)
 		goto fail;
 
 	get_online_cpus();
+	register_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	put_online_cpus();
@@ -519,6 +520,7 @@ static void nmi_shutdown(void)
 	struct op_msrs *msrs;
 
 	get_online_cpus();
+	unregister_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	nmi_enabled = 0;
 	ctr_running = 0;
@@ -739,12 +741,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-	get_online_cpus();
-	register_cpu_notifier(&oprofile_cpu_nb);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -771,14 +767,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
 void op_nmi_exit(void)
 {
-	if (using_nmi) {
+	if (using_nmi)
 		exit_sysfs();
-		get_online_cpus();
-		unregister_cpu_notifier(&oprofile_cpu_nb);
-		nmi_enabled = 0;
-		ctr_running = 0;
-		put_online_cpus();
-	}
 	if (model->exit)
 		model->exit();
 }
-- 
cgit v1.2.3


From bae663bc635e2726c7c5228dbf0f2051e16d1c81 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 5 May 2010 17:47:17 +0200
Subject: oprofile/x86: make AMD IBS hotplug capable

Current IBS code is not hotplug capable. An offline cpu might not be
initialized or deinitialized properly. This patch fixes this by
removing on_each_cpu() functions. The IBS init/deinit code is executed
in the per-cpu functions model->setup_ctrs() and model->cpu_down()
which are also called by hotplug notifiers. model->cpu_down() replaces
model->exit() that became obsolete.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c      |  4 +--
 arch/x86/oprofile/op_model_amd.c | 54 ++++++++++++----------------------------
 arch/x86/oprofile/op_x86_model.h |  2 +-
 3 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2a086726cad..b28d2f1253b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -397,6 +397,8 @@ static void nmi_cpu_shutdown(void *dummy)
 	apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
 	apic_write(APIC_LVTERR, v);
 	nmi_cpu_restore_registers(msrs);
+	if (model->cpu_down)
+		model->cpu_down();
 }
 
 static void nmi_cpu_up(void *dummy)
@@ -769,6 +771,4 @@ void op_nmi_exit(void)
 {
 	if (using_nmi)
 		exit_sysfs();
-	if (model->exit)
-		model->exit();
 }
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 384c5241048..b67a6b5aa8d 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -374,6 +374,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 		val |= op_x86_get_ctrl(model, &counter_config[virt]);
 		wrmsrl(msrs->controls[i].addr, val);
 	}
+
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static void op_amd_cpu_shutdown(void)
+{
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 }
 
 static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -436,28 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static u8 ibs_eilvt_off;
-
-static inline void apic_init_ibs_nmi_per_cpu(void *arg)
-{
-	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
-}
-
-static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
-{
-	setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
-}
-
-static int init_ibs_nmi(void)
+static int __init_ibs_nmi(void)
 {
 #define IBSCTL_LVTOFFSETVAL		(1 << 8)
 #define IBSCTL				0x1cc
 	struct pci_dev *cpu_cfg;
 	int nodes;
 	u32 value = 0;
+	u8 ibs_eilvt_off;
 
-	/* per CPU setup */
-	on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 
 	nodes = 0;
 	cpu_cfg = NULL;
@@ -487,21 +484,15 @@ static int init_ibs_nmi(void)
 	return 0;
 }
 
-/* uninitialize the APIC for the IBS interrupts if needed */
-static void clear_ibs_nmi(void)
-{
-	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
-}
-
 /* initialize the APIC for the IBS interrupts if available */
-static void ibs_init(void)
+static void init_ibs(void)
 {
 	ibs_caps = get_ibs_caps();
 
 	if (!ibs_caps)
 		return;
 
-	if (init_ibs_nmi()) {
+	if (__init_ibs_nmi()) {
 		ibs_caps = 0;
 		return;
 	}
@@ -510,14 +501,6 @@ static void ibs_init(void)
 	       (unsigned)ibs_caps);
 }
 
-static void ibs_exit(void)
-{
-	if (!ibs_caps)
-		return;
-
-	clear_ibs_nmi();
-}
-
 static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
 
 static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -566,17 +549,12 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
 
 static int op_amd_init(struct oprofile_operations *ops)
 {
-	ibs_init();
+	init_ibs();
 	create_arch_files = ops->create_files;
 	ops->create_files = setup_ibs_files;
 	return 0;
 }
 
-static void op_amd_exit(void)
-{
-	ibs_exit();
-}
-
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
 	.num_controls		= NUM_COUNTERS,
@@ -584,9 +562,9 @@ struct op_x86_model_spec op_amd_spec = {
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
 	.init			= op_amd_init,
-	.exit			= op_amd_exit,
 	.fill_in_addresses	= &op_amd_fill_in_addresses,
 	.setup_ctrs		= &op_amd_setup_ctrs,
+	.cpu_down		= &op_amd_cpu_shutdown,
 	.check_ctrs		= &op_amd_check_ctrs,
 	.start			= &op_amd_start,
 	.stop			= &op_amd_stop,
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 551401398fb..89017fa1fd6 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
 	u64		reserved;
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
-	void		(*exit)(void);
 	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
+	void		(*cpu_down)(void);
 	int		(*check_ctrs)(struct pt_regs * const regs,
 				      struct op_msrs const * const msrs);
 	void		(*start)(struct op_msrs const * const msrs);
-- 
cgit v1.2.3