/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include "internal.h"

#include <asm/irq_regs.h>

struct remote_function_call {
        struct task_struct *p;
        int (*func)(void *info);
        void *info;
        int ret;
};

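/*
 * Helper invoked via smp_call_function_single(): when a target task was
 * given, bail out with -EAGAIN if that task is no longer current on this
 * CPU (it moved away or was preempted); otherwise run the requested
 * function and record its return value.
 */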
static void remote_function(void *data)
{
        struct remote_function_call *tfc = data;
        struct task_struct *p = tfc->p;

        if (p) {
                tfc->ret = -EAGAIN;
                if (task_cpu(p) != smp_processor_id() || !task_curr(p))
                        return;
        }

        tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:          the task to evaluate
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, in which case the function is called directly.
 *
 * returns: @func return value, or
 *          -ESRCH  - when the process isn't running
 *          -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
        struct remote_function_call data = {
                .p = p,
                .func = func,
                .info = info,
                .ret = -ESRCH, /* No such (running) process */
        };

        if (task_curr(p))
                smp_call_function_single(task_cpu(p), remote_function, &data, 1);

        return data.ret;
}

/**
 * cpu_function_call - call a function on a given cpu
 * @cpu:        the cpu on which to call the function
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
        struct remote_function_call data = {
                .p = NULL,
                .func = func,
                .info = info,
                .ret = -ENXIO, /* No such CPU */
        };

        smp_call_function_single(cpu, remote_function, &data, 1);

        return data.ret;
}
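
/*
 * Typical usage pattern (sketch): cross-call work such as installing,
 * removing or disabling an event is wrapped in an int (*)(void *)
 * helper and funneled through the two helpers above, for example:
 *
 *      if (!task)
 *              cpu_function_call(event->cpu, __perf_event_disable, event);
 *      else
 *              task_function_call(task, __perf_event_disable, event);
 *
 * with the caller retrying when the task moved away (-EAGAIN).
 */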
117
Stephane Eraniane5d13672011-02-14 11:20:01 +0200118#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
119 PERF_FLAG_FD_OUTPUT |\
120 PERF_FLAG_PID_CGROUP)
121
Stephane Eranian0b3fcf12011-01-03 18:20:01 +0200122enum event_type_t {
123 EVENT_FLEXIBLE = 0x1,
124 EVENT_PINNED = 0x2,
125 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
126};
127
Stephane Eraniane5d13672011-02-14 11:20:01 +0200128/*
129 * perf_sched_events : >0 events exist
130 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
131 */
Jason Barond430d3d2011-03-16 17:29:47 -0400132struct jump_label_key perf_sched_events __read_mostly;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200133static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
134
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200135static atomic_t nr_mmap_events __read_mostly;
136static atomic_t nr_comm_events __read_mostly;
137static atomic_t nr_task_events __read_mostly;
138
Peter Zijlstra108b02c2010-09-06 14:32:03 +0200139static LIST_HEAD(pmus);
140static DEFINE_MUTEX(pmus_lock);
141static struct srcu_struct pmus_srcu;
142
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200143/*
144 * perf event paranoia level:
145 * -1 - not paranoid at all
146 * 0 - disallow raw tracepoint access for unpriv
147 * 1 - disallow cpu events for unpriv
148 * 2 - disallow kernel profiling for unpriv
149 */
150int sysctl_perf_event_paranoid __read_mostly = 1;
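
/*
 * This is exposed to user space as /proc/sys/kernel/perf_event_paranoid;
 * for example, writing 2 there disallows kernel profiling for
 * unprivileged users, as described in the table above.
 */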

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE 100000
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly =
        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);

int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);

        return 0;
}
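
/*
 * The perf_event_max_sample_rate sysctl is wired to the handler above so
 * that max_samples_per_tick is recomputed whenever the sample rate is
 * changed from user space.
 */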

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
        return "pmu";
}

static inline u64 perf_clock(void)
{
        return local_clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

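/*
 * Lock both the CPU context and (if present) the task context, always in
 * cpuctx -> task ctx order so that nested acquisition cannot deadlock.
 */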
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx)
                raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

#ifdef CONFIG_CGROUP_PERF

/*
 * Must ensure cgroup is pinned (css_get) before calling
 * this function. In other words, we cannot call this function
 * if there is no cgroup event for the current CPU context.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
        return container_of(task_subsys_state(task, perf_subsys_id),
                        struct perf_cgroup, css);
}

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        return !event->cgrp || event->cgrp == cpuctx->cgrp;
}

static inline void perf_get_cgroup(struct perf_event *event)
{
        css_get(&event->cgrp->css);
}

static inline void perf_put_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        perf_put_cgroup(event);
        event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        struct perf_cgroup_info *t;

        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
        struct perf_cgroup_info *info;
        u64 now;

        now = perf_clock();

        info = this_cpu_ptr(cgrp->info);

        info->time += now - info->timestamp;
        info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
        struct perf_cgroup *cgrp_out = cpuctx->cgrp;
        if (cgrp_out)
                __update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup *cgrp;

        /*
         * ensure we access cgroup data only when needed and
         * when we know the cgroup is pinned (css_get)
         */
        if (!is_cgroup_event(event))
                return;

        cgrp = perf_cgroup_from_task(current);
        /*
         * Do not update time when cgroup is not active
         */
        if (cgrp == event->cgrp)
                __update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
        struct perf_cgroup *cgrp;
        struct perf_cgroup_info *info;

        /*
         * ctx->lock held by caller
         * ensure we do not access cgroup data
         * unless we have the cgroup pinned (css_get)
         */
        if (!task || !ctx->nr_cgroups)
                return;

        cgrp = perf_cgroup_from_task(task);
        info = this_cpu_ptr(cgrp->info);
        info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN  0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN  : schedule in based on cgroup for next
 */
void perf_cgroup_switch(struct task_struct *task, int mode)
{
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
        unsigned long flags;

        /*
         * disable interrupts to avoid getting nr_cgroup
         * changes via __perf_event_disable(). Also
         * avoids preemption.
         */
        local_irq_save(flags);

        /*
         * we reschedule only in the presence of cgroup
         * constrained events.
         */
        rcu_read_lock();

        list_for_each_entry_rcu(pmu, &pmus, entry) {
                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

                /*
                 * perf_cgroup_events says at least one
                 * context on this CPU has cgroup events.
                 *
                 * ctx->nr_cgroups reports the number of cgroup
                 * events for a context.
                 */
                if (cpuctx->ctx.nr_cgroups > 0) {
                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
                        perf_pmu_disable(cpuctx->ctx.pmu);

                        if (mode & PERF_CGROUP_SWOUT) {
                                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                                /*
                                 * must not be done before ctxswout due
                                 * to event_filter_match() in event_sched_out()
                                 */
                                cpuctx->cgrp = NULL;
                        }

                        if (mode & PERF_CGROUP_SWIN) {
                                WARN_ON_ONCE(cpuctx->cgrp);
                                /*
                                 * set cgrp before ctxsw in to
                                 * allow event_filter_match() to not
                                 * have to pass task around
                                 */
                                cpuctx->cgrp = perf_cgroup_from_task(task);
                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                        }
                        perf_pmu_enable(cpuctx->ctx.pmu);
                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
                }
        }

        rcu_read_unlock();

        local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;

        /*
         * we come here when we know perf_cgroup_events > 0
         */
        cgrp1 = perf_cgroup_from_task(task);

        /*
         * next is NULL when called from perf_event_enable_on_exec()
         * that will systematically cause a cgroup_switch()
         */
        if (next)
                cgrp2 = perf_cgroup_from_task(next);

        /*
         * only schedule out current cgroup events if we know
         * that we are switching to a different cgroup. Otherwise,
         * do not touch the cgroup events.
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;

        /*
         * we come here when we know perf_cgroup_events > 0
         */
        cgrp1 = perf_cgroup_from_task(task);

        /* prev can never be NULL */
        cgrp2 = perf_cgroup_from_task(prev);

        /*
         * only need to schedule in cgroup events if we are changing
         * the cgroup during the context switch; they were not scheduled
         * out at switch-out otherwise.
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}

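/*
 * Resolve the cgroup for an event created with PERF_FLAG_PID_CGROUP: the
 * "pid" argument of perf_event_open() is then a file descriptor for a
 * cgroupfs directory, from which the perf_cgroup is looked up and pinned.
 */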
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct file *file;
        int ret = 0, fput_needed;

        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;

        css = cgroup_css_from_dir(file, perf_subsys_id);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /* must be done before we fput() the file */
        perf_get_cgroup(event);

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
out:
        fput_light(file, fput_needed);
        return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *t;
        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
        /*
         * when the current task's perf cgroup does not match
         * the event's, we need to remember to call the
         * perf_mark_enable() function the first time a task with
         * a matching perf cgroup is scheduled in.
         */
        if (is_cgroup_event(event) && !perf_cgroup_match(event))
                event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
        struct perf_event *sub;
        u64 tstamp = perf_event_time(event);

        if (!event->cgrp_defer_enabled)
                return;

        event->cgrp_defer_enabled = 0;

        event->tstamp_enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
                        sub->cgrp_defer_enabled = 0;
                }
        }
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
}
#endif

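/*
 * perf_pmu_disable()/perf_pmu_enable() nest: a per-cpu count is kept per
 * PMU and the hardware is only touched on the 0 -> 1 and 1 -> 0
 * transitions.
 */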
void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!(*count)++)
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!--(*count))
                pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, rotation_list);

/*
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
 */
static void perf_pmu_rotate_start(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct list_head *head = &__get_cpu_var(rotation_list);

        WARN_ON(!irqs_disabled());

        if (list_empty(&cpuctx->rotation_list))
                list_add(&cpuctx->rotation_list, head);
}

static void get_ctx(struct perf_event_context *ctx)
{
        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void put_ctx(struct perf_event_context *ctx)
{
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
                if (ctx->task)
                        put_task_struct(ctx->task);
                kfree_rcu(ctx, rcu_head);
        }
}

static void unclone_ctx(struct perf_event_context *ctx)
{
        if (ctx->parent_ctx) {
                put_ctx(ctx->parent_ctx);
                ctx->parent_ctx = NULL;
        }
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
        u64 id = event->id;

        if (event->parent)
                id = event->parent->id;

        return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
        struct perf_event_context *ctx;

        rcu_read_lock();
retry:
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed. Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so. If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        goto retry;
                }

                if (!atomic_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        ctx = NULL;
                }
        }
        rcu_read_unlock();
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task. This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
        u64 now = perf_clock();

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time(event);

        return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 * The caller of this function needs to hold the ctx->lock.
 */
static void update_event_times(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        u64 run_end;

        if (event->state < PERF_EVENT_STATE_INACTIVE ||
            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
                return;
        /*
         * in cgroup mode, time_enabled represents
         * the time the event was enabled AND active
         * tasks were in the monitored cgroup. This is
         * independent of the activity of the context as
         * there may be a mix of cgroup and non-cgroup events.
         *
         * That is why we treat cgroup events differently
         * here.
         */
        if (is_cgroup_event(event))
                run_end = perf_event_time(event);
        else if (ctx->is_active)
                run_end = ctx->time;
        else
                run_end = event->tstamp_stopped;

        event->total_time_enabled = run_end - event->tstamp_enabled;

        if (event->state == PERF_EVENT_STATE_INACTIVE)
                run_end = event->tstamp_stopped;
        else
                run_end = perf_event_time(event);

        event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
        struct perf_event *event;

        update_event_times(leader);
        list_for_each_entry(event, &leader->sibling_list, group_entry)
                update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
        if (event->attr.pinned)
                return &ctx->pinned_groups;
        else
                return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                struct list_head *list;

                if (is_software_event(event))
                        event->group_flags |= PERF_GROUP_SOFTWARE;

                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
        }

        if (is_cgroup_event(event))
                ctx->nr_cgroups++;

        list_add_rcu(&event->event_entry, &ctx->event_list);
        if (!ctx->nr_events)
                perf_pmu_rotate_start(ctx->pmu);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__read_size(struct perf_event *event)
{
        int entry = sizeof(u64); /* value */
        int size = 0;
        int nr = 1;

        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                size += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                size += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_ID)
                entry += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_GROUP) {
                nr += event->group_leader->nr_siblings;
                size += sizeof(u64);
        }

        size += entry * nr;
        event->read_size = size;
}
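
/*
 * For example, a group leader with two siblings and read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID | PERF_FORMAT_GROUP
 * gives: entry = 16 (value + id), nr = 3 (leader + 2 siblings),
 * size = 8 (time_enabled) + 8 (nr) + 3 * 16 = 64 bytes.
 */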

static void perf_event__header_size(struct perf_event *event)
{
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        perf_event__read_size(event);

        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                size += sizeof(data->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                size += sizeof(data->period);

        if (sample_type & PERF_SAMPLE_READ)
                size += event->read_size;

        event->header_size = size;
}

static void perf_event__id_header_size(struct perf_event *event)
{
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_TID)
                size += sizeof(data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                size += sizeof(data->time);

        if (sample_type & PERF_SAMPLE_ID)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                size += sizeof(data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                size += sizeof(data->cpu_entry);

        event->id_header_size = size;
}

static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        /*
         * We can have double attach due to group movement in perf_event_open.
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        if (group_leader == event)
                return;

        if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
            !is_software_event(event))
                group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

        list_add_tail(&event->group_entry, &group_leader->sibling_list);
        group_leader->nr_siblings++;

        perf_event__header_size(group_leader);

        list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;
        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        if (is_cgroup_event(event)) {
                ctx->nr_cgroups--;
                cpuctx = __get_cpu_context(ctx);
                /*
                 * if there are no more cgroup events
                 * then clear cgrp to avoid a stale pointer
                 * in update_cgrp_time_from_cpuctx()
                 */
                if (!ctx->nr_cgroups)
                        cpuctx->cgrp = NULL;
        }

        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        if (event->group_leader == event)
                list_del_init(&event->group_entry);

        update_group_times(event);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_OFF;
}

static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *sibling, *tmp;
        struct list_head *list = NULL;

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        /*
         * If this is a sibling, remove it from its group.
         */
        if (event->group_leader != event) {
                list_del_init(&event->group_entry);
                event->group_leader->nr_siblings--;
                goto out;
        }

        if (!list_empty(&event->group_entry))
                list = &event->group_entry;

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
                if (list)
                        list_move_tail(&sibling->group_entry, list);
                sibling->group_leader = sibling;

                /* Inherit group flags from the previous leader */
                sibling->group_flags = event->group_flags;
        }

out:
        perf_event__header_size(event->group_leader);

        list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
                perf_event__header_size(tmp);
}

static inline int
event_filter_match(struct perf_event *event)
{
        return (event->cpu == -1 || event->cpu == smp_processor_id())
            && perf_cgroup_match(event);
}

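/*
 * Stop a single event: update its time accounting, mark it INACTIVE (or
 * OFF when a disable was pending) and remove it from the PMU.
 */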
static void
event_sched_out(struct perf_event *event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        u64 tstamp = perf_event_time(event);
        u64 delta;
        /*
         * An event which could not be activated because of
         * a filter mismatch still needs to have its timings
         * maintained, otherwise bogus information is returned
         * via read() for time_enabled, time_running:
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE
            && !event_filter_match(event)) {
                delta = tstamp - event->tstamp_stopped;
                event->tstamp_running += delta;
                event->tstamp_stopped = tstamp;
        }

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        event->state = PERF_EVENT_STATE_INACTIVE;
        if (event->pending_disable) {
                event->pending_disable = 0;
                event->state = PERF_EVENT_STATE_OFF;
        }
        event->tstamp_stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;

        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        ctx->nr_active--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        struct perf_event *event;
        int state = group_event->state;

        event_sched_out(group_event, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);

        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static int __perf_remove_from_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
        list_del_event(event, ctx);
        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                ctx->is_active = 0;
                cpuctx->task_ctx = NULL;
        }
        raw_spin_unlock(&ctx->lock);

        return 0;
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;

        lockdep_assert_held(&ctx->mutex);

        if (!task) {
                /*
                 * Per cpu events are removed via an smp call and
                 * the removal is always successful.
                 */
                cpu_function_call(event->cpu, __perf_remove_from_context, event);
                return;
        }

retry:
        if (!task_function_call(task, __perf_remove_from_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * If we failed to find a running task, but find the context active now
         * that we've acquired the ctx->lock, retry.
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * Since the task isn't running, it's safe to remove the event; us
         * holding the ctx->lock ensures the task won't get scheduled in.
         */
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static int __perf_event_disable(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        /*
         * If this is a per-task event, need to check whether this
         * event's task is the current task on this cpu.
         *
         * Can trigger due to concurrent perf_event_context_sched_out()
         * flipping contexts around.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return -EINVAL;

        raw_spin_lock(&ctx->lock);

        /*
         * If the event is on, turn it off.
         * If it is in error state, leave it in error state.
         */
        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
                update_group_times(event);
                if (event == event->group_leader)
                        group_sched_out(event, cpuctx, ctx);
                else
                        event_sched_out(event, cpuctx, ctx);
                event->state = PERF_EVENT_STATE_OFF;
        }

        raw_spin_unlock(&ctx->lock);

        return 0;
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Disable the event on the cpu that it's on
                 */
                cpu_function_call(event->cpu, __perf_event_disable, event);
                return;
        }

retry:
        if (!task_function_call(task, __perf_event_disable, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * If the event is still active, we need to retry the cross-call.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
                raw_spin_unlock_irq(&ctx->lock);
                /*
                 * Reload the task pointer, it might have been changed by
                 * a concurrent perf_event_context_sched_out().
                 */
                task = ctx->task;
                goto retry;
        }

        /*
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE) {
                update_group_times(event);
                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);
}

static void perf_set_shadow_time(struct perf_event *event,
                                 struct perf_event_context *ctx,
                                 u64 tstamp)
{
        /*
         * use the correct time source for the time snapshot
         *
         * We could get by without this by leveraging the
         * fact that to get to this function, the caller
         * has most likely already called update_context_time()
         * and update_cgrp_time_xx() and thus both timestamps
         * are identical (or very close). Given that tstamp is
         * already adjusted for cgroup, we could say that:
         *     tstamp - ctx->timestamp
         * is equivalent to
         *     tstamp - cgrp->timestamp.
         *
         * Then, in perf_output_read(), the calculation would
         * work with no changes because:
         * - event is guaranteed scheduled in
         * - no scheduled out in between
         * - thus the timestamp would be the same
         *
         * But this is a bit hairy.
         *
         * So instead, we have an explicit cgroup call to remain
         * within the time source all along. We believe it
         * is cleaner and simpler to understand.
         */
        if (is_cgroup_event(event))
                perf_cgroup_set_shadow_time(event, tstamp);
        else
                event->shadow_ctx_time = tstamp - ctx->timestamp;
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);

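/*
 * Start a single event on this CPU: mark it ACTIVE, hand it to the PMU
 * via pmu->add() and update the running-time and exclusive/active
 * accounting. Returns -EAGAIN when the PMU has no room for it.
 */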
static int
event_sched_in(struct perf_event *event,
               struct perf_cpu_context *cpuctx,
               struct perf_event_context *ctx)
{
        u64 tstamp = perf_event_time(event);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        event->state = PERF_EVENT_STATE_ACTIVE;
        event->oncpu = smp_processor_id();

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        /*
         * The new state must be visible before we turn it on in the hardware:
         */
        smp_wmb();

        if (event->pmu->add(event, PERF_EF_START)) {
                event->state = PERF_EVENT_STATE_INACTIVE;
                event->oncpu = -1;
                return -EAGAIN;
        }

        event->tstamp_running += tstamp - event->tstamp_stopped;

        perf_set_shadow_time(event, ctx, tstamp);

        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        ctx->nr_active++;

        if (event->attr.exclusive)
                cpuctx->exclusive = 1;

        return 0;
}

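/*
 * Schedule in a whole group as one unit, using the PMU transaction
 * interface: start_txn(), add each member, then commit_txn(). If any
 * member or the commit fails, the transaction is cancelled and the
 * members scheduled in so far are rolled back.
 */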
1414static int
1415group_sched_in(struct perf_event *group_event,
1416 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001417 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001418{
Lin Ming6bde9b62010-04-23 13:56:00 +08001419 struct perf_event *event, *partial_group = NULL;
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02001420 struct pmu *pmu = group_event->pmu;
Stephane Eraniand7842da2010-10-20 15:25:01 +02001421 u64 now = ctx->time;
1422 bool simulate = false;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001423
1424 if (group_event->state == PERF_EVENT_STATE_OFF)
1425 return 0;
1426
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001427 pmu->start_txn(pmu);
Lin Ming6bde9b62010-04-23 13:56:00 +08001428
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001429 if (event_sched_in(group_event, cpuctx, ctx)) {
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001430 pmu->cancel_txn(pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001431 return -EAGAIN;
Stephane Eranian90151c32010-05-25 16:23:10 +02001432 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001433
1434 /*
1435 * Schedule in siblings as one group (if any):
1436 */
1437 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001438 if (event_sched_in(event, cpuctx, ctx)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001439 partial_group = event;
1440 goto group_error;
1441 }
1442 }
1443
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001444 if (!pmu->commit_txn(pmu))
Paul Mackerras6e851582010-05-08 20:58:00 +10001445 return 0;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001446
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001447group_error:
1448 /*
1449 * Groups can be scheduled in as one unit only, so undo any
1450 * partial group before returning:
Stephane Eraniand7842da2010-10-20 15:25:01 +02001451 * The events up to the failed event are scheduled out normally,
1452 * tstamp_stopped will be updated.
1453 *
1454 * The failed events and the remaining siblings need to have
1455 * their timings updated as if they had gone thru event_sched_in()
1456 * and event_sched_out(). This is required to get consistent timings
1457 * across the group. This also takes care of the case where the group
1458 * could never be scheduled by ensuring tstamp_stopped is set to mark
1459 * the time the event was actually stopped, such that time delta
1460 * calculation in update_event_times() is correct.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001461 */
1462 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1463 if (event == partial_group)
Stephane Eraniand7842da2010-10-20 15:25:01 +02001464 simulate = true;
1465
1466 if (simulate) {
1467 event->tstamp_running += now - event->tstamp_stopped;
1468 event->tstamp_stopped = now;
1469 } else {
1470 event_sched_out(event, cpuctx, ctx);
1471 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001472 }
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001473 event_sched_out(group_event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001474
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001475 pmu->cancel_txn(pmu);
Stephane Eranian90151c32010-05-25 16:23:10 +02001476
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001477 return -EAGAIN;
1478}
1479
1480/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001481 * Work out whether we can put this event group on the CPU now.
1482 */
1483static int group_can_go_on(struct perf_event *event,
1484 struct perf_cpu_context *cpuctx,
1485 int can_add_hw)
1486{
1487 /*
1488 * Groups consisting entirely of software events can always go on.
1489 */
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001490 if (event->group_flags & PERF_GROUP_SOFTWARE)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001491 return 1;
1492 /*
1493 * If an exclusive group is already on, no other hardware
1494 * events can go on.
1495 */
1496 if (cpuctx->exclusive)
1497 return 0;
1498 /*
1499 * If this group is exclusive and there are already
1500 * events on the CPU, it can't go on.
1501 */
1502 if (event->attr.exclusive && cpuctx->active_oncpu)
1503 return 0;
1504 /*
1505 * Otherwise, try to add it if all previous groups were able
1506 * to go on.
1507 */
1508 return can_add_hw;
1509}
1510
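/*
 * Added note (not in the original source): add the event to the context's
 * lists, attach it to its group and initialize all of its timestamps to
 * the current context time.
 */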
1511static void add_event_to_ctx(struct perf_event *event,
1512 struct perf_event_context *ctx)
1513{
Stephane Eranian41587552011-01-03 18:20:01 +02001514 u64 tstamp = perf_event_time(event);
1515
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001516 list_add_event(event, ctx);
Peter Zijlstra8a495422010-05-27 15:47:49 +02001517 perf_group_attach(event);
Stephane Eranian41587552011-01-03 18:20:01 +02001518 event->tstamp_enabled = tstamp;
1519 event->tstamp_running = tstamp;
1520 event->tstamp_stopped = tstamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001521}
1522
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001523static void task_ctx_sched_out(struct perf_event_context *ctx);
1524static void
1525ctx_sched_in(struct perf_event_context *ctx,
1526 struct perf_cpu_context *cpuctx,
1527 enum event_type_t event_type,
1528 struct task_struct *task);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001529
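/*
 * Added note (not in the original source): schedule in both the cpu and
 * (optionally) the task context, keeping the priority order:
 * cpu pinned, task pinned, cpu flexible, task flexible.
 */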
Peter Zijlstradce58552011-04-09 21:17:46 +02001530static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1531 struct perf_event_context *ctx,
1532 struct task_struct *task)
1533{
1534 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1535 if (ctx)
1536 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1537 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1538 if (ctx)
1539 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1540}
1541
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001542/*
1543 * Cross CPU call to install and enable a performance event
1544 *
1545 * Must be called with ctx->mutex held
1546 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001547static int __perf_install_in_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001548{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001549 struct perf_event *event = info;
1550 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001551 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001552 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1553 struct task_struct *task = current;
1554
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02001555 perf_ctx_lock(cpuctx, task_ctx);
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001556 perf_pmu_disable(cpuctx->ctx.pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001557
1558 /*
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001559 * If there was an active task_ctx, schedule it out.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001560 */
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02001561 if (task_ctx)
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001562 task_ctx_sched_out(task_ctx);
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02001563
1564 /*
1565 * If the context we're installing events in is not the
1566 * active task_ctx, flip them.
1567 */
1568 if (ctx->task && task_ctx != ctx) {
1569 if (task_ctx)
1570 raw_spin_unlock(&task_ctx->lock);
1571 raw_spin_lock(&ctx->lock);
1572 task_ctx = ctx;
1573 }
1574
1575 if (task_ctx) {
1576 cpuctx->task_ctx = task_ctx;
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001577 task = task_ctx->task;
1578 }
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02001579
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001580 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001581
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001582 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001583 /*
1584 * update cgrp time only if current cgrp
1585 * matches event->cgrp. Must be done before
1586 * calling add_event_to_ctx()
1587 */
1588 update_cgrp_time_from_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001589
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001590 add_event_to_ctx(event, ctx);
1591
1592 /*
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001593 * Schedule everything back in
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001594 */
Peter Zijlstradce58552011-04-09 21:17:46 +02001595 perf_event_sched_in(cpuctx, task_ctx, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001596
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02001597 perf_pmu_enable(cpuctx->ctx.pmu);
1598 perf_ctx_unlock(cpuctx, task_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001599
1600 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001601}
1602
1603/*
1604 * Attach a performance event to a context
1605 *
1606 * First we add the event to the list with the hardware enable bit
1607 * in event->hw_config cleared.
1608 *
1609 * If the event is attached to a task which is running on a CPU, we use
1610 * an smp call to enable it in the task context. The task might have been
1611 * scheduled away, but we check this in the smp call again.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001612 */
1613static void
1614perf_install_in_context(struct perf_event_context *ctx,
1615 struct perf_event *event,
1616 int cpu)
1617{
1618 struct task_struct *task = ctx->task;
1619
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001620 lockdep_assert_held(&ctx->mutex);
1621
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02001622 event->ctx = ctx;
1623
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001624 if (!task) {
1625 /*
1626 * Per cpu events are installed via an smp call and
André Goddard Rosaaf901ca2009-11-14 13:09:05 -02001627 * the install is always successful.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001628 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001629 cpu_function_call(cpu, __perf_install_in_context, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001630 return;
1631 }
1632
1633retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001634 if (!task_function_call(task, __perf_install_in_context, event))
1635 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001636
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001637 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001638 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001639 * If we failed to find a running task, but now find the context active
1640 * after we've acquired the ctx->lock, retry.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001641 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001642 if (ctx->is_active) {
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001643 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001644 goto retry;
1645 }
1646
1647 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001648 * Since the task isn't running, it's safe to add the event; holding
1649 * the ctx->lock ensures the task won't get scheduled in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001650 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001651 add_event_to_ctx(event, ctx);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001652 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001653}
1654
1655/*
1656 * Put an event into inactive state and update time fields.
1657 * Enabling the leader of a group effectively enables all
1658 * the group members that aren't explicitly disabled, so we
1659 * have to update their ->tstamp_enabled also.
1660 * Note: this works for group members as well as group leaders
1661 * since the non-leader members' sibling_lists will be empty.
1662 */
1663static void __perf_event_mark_enabled(struct perf_event *event,
1664 struct perf_event_context *ctx)
1665{
1666 struct perf_event *sub;
Stephane Eranian41587552011-01-03 18:20:01 +02001667 u64 tstamp = perf_event_time(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001668
1669 event->state = PERF_EVENT_STATE_INACTIVE;
Stephane Eranian41587552011-01-03 18:20:01 +02001670 event->tstamp_enabled = tstamp - event->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001671 list_for_each_entry(sub, &event->sibling_list, group_entry) {
Stephane Eranian41587552011-01-03 18:20:01 +02001672 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1673 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001674 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001675}
1676
1677/*
1678 * Cross CPU call to enable a performance event
1679 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001680static int __perf_event_enable(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001681{
1682 struct perf_event *event = info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001683 struct perf_event_context *ctx = event->ctx;
1684 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001685 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001686 int err;
1687
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001688 if (WARN_ON_ONCE(!ctx->is_active))
1689 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001690
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001691 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001692 update_context_time(ctx);
1693
1694 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1695 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001696
1697 /*
1698 * set current task's cgroup time reference point
1699 */
Stephane Eranian3f7cce32011-02-18 14:40:01 +02001700 perf_cgroup_set_timestamp(current, ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001701
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001702 __perf_event_mark_enabled(event, ctx);
1703
Stephane Eraniane5d13672011-02-14 11:20:01 +02001704 if (!event_filter_match(event)) {
1705 if (is_cgroup_event(event))
1706 perf_cgroup_defer_enabled(event);
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001707 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001708 }
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001709
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001710 /*
1711 * If the event is in a group and isn't the group leader,
1712 * then don't put it on unless the group is on.
1713 */
1714 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1715 goto unlock;
1716
1717 if (!group_can_go_on(event, cpuctx, 1)) {
1718 err = -EEXIST;
1719 } else {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001720 if (event == leader)
Peter Zijlstra6e377382010-02-11 13:21:58 +01001721 err = group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001722 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001723 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001724 }
1725
1726 if (err) {
1727 /*
1728 * If this event can't go on and it's part of a
1729 * group, then the whole group has to come off.
1730 */
1731 if (leader != event)
1732 group_sched_out(leader, cpuctx, ctx);
1733 if (leader->attr.pinned) {
1734 update_group_times(leader);
1735 leader->state = PERF_EVENT_STATE_ERROR;
1736 }
1737 }
1738
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001739unlock:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001740 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001741
1742 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001743}
1744
1745/*
1746 * Enable an event.
1747 *
1748 * If event->ctx is a cloned context, callers must make sure that
1749 * every task struct that event->ctx->task could possibly point to
1750 * remains valid. This condition is satisfied when called through
1751 * perf_event_for_each_child or perf_event_for_each as described
1752 * for perf_event_disable.
1753 */
Frederic Weisbecker44234ad2009-12-09 09:25:48 +01001754void perf_event_enable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001755{
1756 struct perf_event_context *ctx = event->ctx;
1757 struct task_struct *task = ctx->task;
1758
1759 if (!task) {
1760 /*
1761 * Enable the event on the cpu that it's on
1762 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001763 cpu_function_call(event->cpu, __perf_event_enable, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001764 return;
1765 }
1766
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001767 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001768 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1769 goto out;
1770
1771 /*
1772 * If the event is in error state, clear that first.
1773 * That way, if we see the event in error state below, we
1774 * know that it has gone back into error state, as distinct
1775 * from the task having been scheduled away before the
1776 * cross-call arrived.
1777 */
1778 if (event->state == PERF_EVENT_STATE_ERROR)
1779 event->state = PERF_EVENT_STATE_OFF;
1780
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001781retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001782 if (!ctx->is_active) {
1783 __perf_event_mark_enabled(event, ctx);
1784 goto out;
1785 }
1786
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001787 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001788
1789 if (!task_function_call(task, __perf_event_enable, event))
1790 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001791
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001792 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001793
1794 /*
1795 * If the context is active and the event is still off,
1796 * we need to retry the cross-call.
1797 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001798 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1799 /*
1800 * task could have been flipped by a concurrent
1801 * perf_event_context_sched_out()
1802 */
1803 task = ctx->task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001804 goto retry;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001805 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001806
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001807out:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001808 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001809}
1810
Avi Kivity26ca5c12011-06-29 18:42:37 +03001811int perf_event_refresh(struct perf_event *event, int refresh)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001812{
1813 /*
1814 * not supported on inherited events
1815 */
Franck Bui-Huu2e939d12010-11-23 16:21:44 +01001816 if (event->attr.inherit || !is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001817 return -EINVAL;
1818
1819 atomic_add(refresh, &event->event_limit);
1820 perf_event_enable(event);
1821
1822 return 0;
1823}
Avi Kivity26ca5c12011-06-29 18:42:37 +03001824EXPORT_SYMBOL_GPL(perf_event_refresh);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001825
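/*
 * Added note (not in the original source): schedule out the pinned and/or
 * flexible groups of a context, as selected by @event_type, updating the
 * context and cgroup times first.
 */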
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001826static void ctx_sched_out(struct perf_event_context *ctx,
1827 struct perf_cpu_context *cpuctx,
1828 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001829{
1830 struct perf_event *event;
Peter Zijlstradb24d332011-04-09 21:17:45 +02001831 int is_active = ctx->is_active;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001832
Peter Zijlstradb24d332011-04-09 21:17:45 +02001833 ctx->is_active &= ~event_type;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001834 if (likely(!ctx->nr_events))
Peter Zijlstrafacc4302011-04-09 21:17:42 +02001835 return;
1836
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001837 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001838 update_cgrp_time_from_cpuctx(cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001839 if (!ctx->nr_active)
Peter Zijlstrafacc4302011-04-09 21:17:42 +02001840 return;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001841
Peter Zijlstra075e0b02011-04-09 21:17:40 +02001842 perf_pmu_disable(ctx->pmu);
Peter Zijlstradb24d332011-04-09 21:17:45 +02001843 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001844 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1845 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001846 }
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001847
Peter Zijlstradb24d332011-04-09 21:17:45 +02001848 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001849 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001850 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001851 }
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001852 perf_pmu_enable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001853}
1854
1855/*
1856 * Test whether two contexts are equivalent, i.e. whether they
1857 * have both been cloned from the same version of the same context
1858 * and they both have the same number of enabled events.
1859 * If the number of enabled events is the same, then the set
1860 * of enabled events should be the same, because these are both
1861 * inherited contexts, therefore we can't access individual events
1862 * in them directly with an fd; we can only enable/disable all
1863 * events via prctl, or enable/disable all events in a family
1864 * via ioctl, which will have the same effect on both contexts.
1865 */
1866static int context_equiv(struct perf_event_context *ctx1,
1867 struct perf_event_context *ctx2)
1868{
1869 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1870 && ctx1->parent_gen == ctx2->parent_gen
1871 && !ctx1->pin_count && !ctx2->pin_count;
1872}
1873
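/*
 * Added note (not in the original source): swap the inherit_stat counts
 * and times of an event with those of its counterpart in the context we
 * are switching to, so per-task stats stay with the task across the
 * cloned-context switch optimization.
 */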
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001874static void __perf_event_sync_stat(struct perf_event *event,
1875 struct perf_event *next_event)
1876{
1877 u64 value;
1878
1879 if (!event->attr.inherit_stat)
1880 return;
1881
1882 /*
1883 * Update the event value. We cannot use perf_event_read()
1884 * because we're in the middle of a context switch and have IRQs
1885 * disabled, which upsets smp_call_function_single(); however,
1886 * we know the event must be on the current CPU, therefore we
1887 * don't need to use it.
1888 */
1889 switch (event->state) {
1890 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01001891 event->pmu->read(event);
1892 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001893
1894 case PERF_EVENT_STATE_INACTIVE:
1895 update_event_times(event);
1896 break;
1897
1898 default:
1899 break;
1900 }
1901
1902 /*
1903 * In order to keep per-task stats reliable we need to flip the event
1904 * values when we flip the contexts.
1905 */
Peter Zijlstrae7850592010-05-21 14:43:08 +02001906 value = local64_read(&next_event->count);
1907 value = local64_xchg(&event->count, value);
1908 local64_set(&next_event->count, value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001909
1910 swap(event->total_time_enabled, next_event->total_time_enabled);
1911 swap(event->total_time_running, next_event->total_time_running);
1912
1913 /*
1914 * Since we swizzled the values, update the user visible data too.
1915 */
1916 perf_event_update_userpage(event);
1917 perf_event_update_userpage(next_event);
1918}
1919
1920#define list_next_entry(pos, member) \
1921 list_entry(pos->member.next, typeof(*pos), member)
1922
1923static void perf_event_sync_stat(struct perf_event_context *ctx,
1924 struct perf_event_context *next_ctx)
1925{
1926 struct perf_event *event, *next_event;
1927
1928 if (!ctx->nr_stat)
1929 return;
1930
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01001931 update_context_time(ctx);
1932
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001933 event = list_first_entry(&ctx->event_list,
1934 struct perf_event, event_entry);
1935
1936 next_event = list_first_entry(&next_ctx->event_list,
1937 struct perf_event, event_entry);
1938
1939 while (&event->event_entry != &ctx->event_list &&
1940 &next_event->event_entry != &next_ctx->event_list) {
1941
1942 __perf_event_sync_stat(event, next_event);
1943
1944 event = list_next_entry(event, event_entry);
1945 next_event = list_next_entry(next_event, event_entry);
1946 }
1947}
1948
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001949static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1950 struct task_struct *next)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001951{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001952 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001953 struct perf_event_context *next_ctx;
1954 struct perf_event_context *parent;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001955 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001956 int do_switch = 1;
1957
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001958 if (likely(!ctx))
1959 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001960
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001961 cpuctx = __get_cpu_context(ctx);
1962 if (!cpuctx->task_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001963 return;
1964
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001965 rcu_read_lock();
1966 parent = rcu_dereference(ctx->parent_ctx);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001967 next_ctx = next->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001968 if (parent && next_ctx &&
1969 rcu_dereference(next_ctx->parent_ctx) == parent) {
1970 /*
1971 * Looks like the two contexts are clones, so we might be
1972 * able to optimize the context switch. We lock both
1973 * contexts and check that they are clones under the
1974 * lock (including re-checking that neither has been
1975 * uncloned in the meantime). It doesn't matter which
1976 * order we take the locks because no other cpu could
1977 * be trying to lock both of these tasks.
1978 */
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001979 raw_spin_lock(&ctx->lock);
1980 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001981 if (context_equiv(ctx, next_ctx)) {
1982 /*
1983 * XXX do we need a memory barrier of sorts
1984 * wrt rcu_dereference() of perf_event_ctxp
1985 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001986 task->perf_event_ctxp[ctxn] = next_ctx;
1987 next->perf_event_ctxp[ctxn] = ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001988 ctx->task = next;
1989 next_ctx->task = task;
1990 do_switch = 0;
1991
1992 perf_event_sync_stat(ctx, next_ctx);
1993 }
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001994 raw_spin_unlock(&next_ctx->lock);
1995 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001996 }
1997 rcu_read_unlock();
1998
1999 if (do_switch) {
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002000 raw_spin_lock(&ctx->lock);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002001 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002002 cpuctx->task_ctx = NULL;
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002003 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002004 }
2005}
2006
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002007#define for_each_task_context_nr(ctxn) \
2008 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2009
2010/*
2011 * Called from scheduler to remove the events of the current task,
2012 * with interrupts disabled.
2013 *
2014 * We stop each event and update the event value in event->count.
2015 *
2016 * This does not protect us against NMI, but disable()
2017 * sets the disabled bit in the control field of event _before_
2018 * accessing the event control register. If an NMI hits, then it will
2019 * not restart the event.
2020 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002021void __perf_event_task_sched_out(struct task_struct *task,
2022 struct task_struct *next)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002023{
2024 int ctxn;
2025
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002026 for_each_task_context_nr(ctxn)
2027 perf_event_context_sched_out(task, ctxn, next);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002028
2029 /*
2030 * if cgroup events exist on this CPU, then we need
2031 * to check if we have to switch out PMU state.
2032 * cgroup events are system-wide mode only
2033 */
2034 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002035 perf_cgroup_sched_out(task, next);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002036}
2037
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02002038static void task_ctx_sched_out(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002039{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002040 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002041
2042 if (!cpuctx->task_ctx)
2043 return;
2044
2045 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2046 return;
2047
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02002048 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002049 cpuctx->task_ctx = NULL;
2050}
2051
2052/*
2053 * Called with IRQs disabled
2054 */
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002055static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2056 enum event_type_t event_type)
2057{
2058 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002059}
2060
2061static void
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002062ctx_pinned_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002063 struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002064{
2065 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002066
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002067 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2068 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002069 continue;
Stephane Eranian5632ab12011-01-03 18:20:01 +02002070 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002071 continue;
2072
Stephane Eraniane5d13672011-02-14 11:20:01 +02002073 /* may need to reset tstamp_enabled */
2074 if (is_cgroup_event(event))
2075 perf_cgroup_mark_enabled(event, ctx);
2076
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002077 if (group_can_go_on(event, cpuctx, 1))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002078 group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002079
2080 /*
2081 * If this pinned group hasn't been scheduled,
2082 * put it in error state.
2083 */
2084 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2085 update_group_times(event);
2086 event->state = PERF_EVENT_STATE_ERROR;
2087 }
2088 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002089}
2090
2091static void
2092ctx_flexible_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002093 struct perf_cpu_context *cpuctx)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002094{
2095 struct perf_event *event;
2096 int can_add_hw = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002097
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002098 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2099 /* Ignore events in OFF or ERROR state */
2100 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002101 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002102 /*
2103 * Listen to the 'cpu' scheduling filter constraint
2104 * of events:
2105 */
Stephane Eranian5632ab12011-01-03 18:20:01 +02002106 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002107 continue;
2108
Stephane Eraniane5d13672011-02-14 11:20:01 +02002109 /* may need to reset tstamp_enabled */
2110 if (is_cgroup_event(event))
2111 perf_cgroup_mark_enabled(event, ctx);
2112
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002113 if (group_can_go_on(event, cpuctx, can_add_hw)) {
Peter Zijlstra6e377382010-02-11 13:21:58 +01002114 if (group_sched_in(event, cpuctx, ctx))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002115 can_add_hw = 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002116 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002117 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002118}
2119
2120static void
2121ctx_sched_in(struct perf_event_context *ctx,
2122 struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002123 enum event_type_t event_type,
2124 struct task_struct *task)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002125{
Stephane Eraniane5d13672011-02-14 11:20:01 +02002126 u64 now;
Peter Zijlstradb24d332011-04-09 21:17:45 +02002127 int is_active = ctx->is_active;
Stephane Eraniane5d13672011-02-14 11:20:01 +02002128
Peter Zijlstradb24d332011-04-09 21:17:45 +02002129 ctx->is_active |= event_type;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002130 if (likely(!ctx->nr_events))
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002131 return;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002132
Stephane Eraniane5d13672011-02-14 11:20:01 +02002133 now = perf_clock();
2134 ctx->timestamp = now;
Stephane Eranian3f7cce32011-02-18 14:40:01 +02002135 perf_cgroup_set_timestamp(task, ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002136 /*
2137 * First go through the list and put on any pinned groups
2138 * in order to give them the best chance of going on.
2139 */
Peter Zijlstradb24d332011-04-09 21:17:45 +02002140 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002141 ctx_pinned_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002142
2143 /* Then walk through the lower prio flexible groups */
Peter Zijlstradb24d332011-04-09 21:17:45 +02002144 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002145 ctx_flexible_sched_in(ctx, cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002146}
2147
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002148static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002149 enum event_type_t event_type,
2150 struct task_struct *task)
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002151{
2152 struct perf_event_context *ctx = &cpuctx->ctx;
2153
Stephane Eraniane5d13672011-02-14 11:20:01 +02002154 ctx_sched_in(ctx, cpuctx, event_type, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002155}
2156
Stephane Eraniane5d13672011-02-14 11:20:01 +02002157static void perf_event_context_sched_in(struct perf_event_context *ctx,
2158 struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002159{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002160 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002161
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002162 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002163 if (cpuctx->task_ctx == ctx)
2164 return;
2165
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002166 perf_ctx_lock(cpuctx, ctx);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002167 perf_pmu_disable(ctx->pmu);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002168 /*
2169 * We want to keep the following priority order:
2170 * cpu pinned (that don't need to move), task pinned,
2171 * cpu flexible, task flexible.
2172 */
2173 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2174
Peter Zijlstradce58552011-04-09 21:17:46 +02002175 perf_event_sched_in(cpuctx, ctx, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002176
2177 cpuctx->task_ctx = ctx;
eranian@google.com9b33fa62010-03-10 22:26:05 -08002178
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002179 perf_pmu_enable(ctx->pmu);
2180 perf_ctx_unlock(cpuctx, ctx);
2181
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002182 /*
2183 * Since these rotations are per-cpu, we need to ensure the
2184 * cpu-context we got scheduled on is actually rotating.
2185 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002186 perf_pmu_rotate_start(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002187}
2188
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002189/*
2190 * Called from scheduler to add the events of the current task
2191 * with interrupts disabled.
2192 *
2193 * We restore the event value and then enable it.
2194 *
2195 * This does not protect us against NMI, but enable()
2196 * sets the enabled bit in the control field of event _before_
2197 * accessing the event control register. If an NMI hits, then it will
2198 * keep the event running.
2199 */
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002200void __perf_event_task_sched_in(struct task_struct *prev,
2201 struct task_struct *task)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002202{
2203 struct perf_event_context *ctx;
2204 int ctxn;
2205
2206 for_each_task_context_nr(ctxn) {
2207 ctx = task->perf_event_ctxp[ctxn];
2208 if (likely(!ctx))
2209 continue;
2210
Stephane Eraniane5d13672011-02-14 11:20:01 +02002211 perf_event_context_sched_in(ctx, task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002212 }
Stephane Eraniane5d13672011-02-14 11:20:01 +02002213 /*
2214 * if cgroup events exist on this CPU, then we need
2215 * to check if we have to switch in PMU state.
2216 * cgroup events are system-wide mode only
2217 */
2218 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002219 perf_cgroup_sched_in(prev, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002220}
2221
Peter Zijlstraabd50712010-01-26 18:50:16 +01002222static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2223{
2224 u64 frequency = event->attr.sample_freq;
2225 u64 sec = NSEC_PER_SEC;
2226 u64 divisor, dividend;
2227
2228 int count_fls, nsec_fls, frequency_fls, sec_fls;
2229
2230 count_fls = fls64(count);
2231 nsec_fls = fls64(nsec);
2232 frequency_fls = fls64(frequency);
2233 sec_fls = 30;
2234
2235 /*
2236 * We got @count in @nsec, with a target of sample_freq HZ
2237 * the target period becomes:
2238 *
2239 * @count * 10^9
2240 * period = -------------------
2241 * @nsec * sample_freq
2242 *
2243 */
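	/*
	 * Added worked example (hypothetical numbers, not in the original
	 * source): @count = 10^6 events in @nsec = 10^7 ns (10ms) with
	 * sample_freq = 1000 gives
	 * period = 10^6 * 10^9 / (10^7 * 10^3) = 100,000 events per sample.
	 */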
2244
2245 /*
2246 * Reduce accuracy by one bit such that @a and @b converge
2247 * to a similar magnitude.
2248 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002249#define REDUCE_FLS(a, b) \
Peter Zijlstraabd50712010-01-26 18:50:16 +01002250do { \
2251 if (a##_fls > b##_fls) { \
2252 a >>= 1; \
2253 a##_fls--; \
2254 } else { \
2255 b >>= 1; \
2256 b##_fls--; \
2257 } \
2258} while (0)
2259
2260 /*
2261 * Reduce accuracy until either term fits in a u64, then proceed with
2262 * the other, so that finally we can do a u64/u64 division.
2263 */
2264 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2265 REDUCE_FLS(nsec, frequency);
2266 REDUCE_FLS(sec, count);
2267 }
2268
2269 if (count_fls + sec_fls > 64) {
2270 divisor = nsec * frequency;
2271
2272 while (count_fls + sec_fls > 64) {
2273 REDUCE_FLS(count, sec);
2274 divisor >>= 1;
2275 }
2276
2277 dividend = count * sec;
2278 } else {
2279 dividend = count * sec;
2280
2281 while (nsec_fls + frequency_fls > 64) {
2282 REDUCE_FLS(nsec, frequency);
2283 dividend >>= 1;
2284 }
2285
2286 divisor = nsec * frequency;
2287 }
2288
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002289 if (!divisor)
2290 return dividend;
2291
Peter Zijlstraabd50712010-01-26 18:50:16 +01002292 return div64_u64(dividend, divisor);
2293}
2294
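/*
 * Added note (not in the original source): move the sample period towards
 * the target period computed by perf_calculate_period(), applying a simple
 * low-pass filter (1/8th of the delta per adjustment) so the period does
 * not oscillate; restart the event if period_left has drifted too far.
 */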
2295static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002296{
2297 struct hw_perf_event *hwc = &event->hw;
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002298 s64 period, sample_period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002299 s64 delta;
2300
Peter Zijlstraabd50712010-01-26 18:50:16 +01002301 period = perf_calculate_period(event, nsec, count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002302
2303 delta = (s64)(period - hwc->sample_period);
2304 delta = (delta + 7) / 8; /* low pass filter */
2305
2306 sample_period = hwc->sample_period + delta;
2307
2308 if (!sample_period)
2309 sample_period = 1;
2310
2311 hwc->sample_period = sample_period;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002312
Peter Zijlstrae7850592010-05-21 14:43:08 +02002313 if (local64_read(&hwc->period_left) > 8*sample_period) {
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002314 event->pmu->stop(event, PERF_EF_UPDATE);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002315 local64_set(&hwc->period_left, 0);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002316 event->pmu->start(event, PERF_EF_RELOAD);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002317 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002318}
2319
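/*
 * Added note (not in the original source): walk the context's events from
 * the timer tick, unthrottle any event that hit MAX_INTERRUPTS, and for
 * freq-based sampling events recompute the sample period from the counts
 * accumulated since the last tick.
 */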
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002320static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002321{
2322 struct perf_event *event;
2323 struct hw_perf_event *hwc;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002324 u64 interrupts, now;
2325 s64 delta;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002326
Paul Mackerras03541f82009-10-14 16:58:03 +11002327 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002328 if (event->state != PERF_EVENT_STATE_ACTIVE)
2329 continue;
2330
Stephane Eranian5632ab12011-01-03 18:20:01 +02002331 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01002332 continue;
2333
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002334 hwc = &event->hw;
2335
2336 interrupts = hwc->interrupts;
2337 hwc->interrupts = 0;
2338
2339 /*
2340 * unthrottle events on the tick
2341 */
2342 if (interrupts == MAX_INTERRUPTS) {
2343 perf_log_throttle(event, 1);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002344 event->pmu->start(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002345 }
2346
2347 if (!event->attr.freq || !event->attr.sample_freq)
2348 continue;
2349
Peter Zijlstraabd50712010-01-26 18:50:16 +01002350 event->pmu->read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002351 now = local64_read(&event->count);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002352 delta = now - hwc->freq_count_stamp;
2353 hwc->freq_count_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002354
Peter Zijlstraabd50712010-01-26 18:50:16 +01002355 if (delta > 0)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002356 perf_adjust_period(event, period, delta);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002357 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002358}
2359
2360/*
2361 * Round-robin a context's events:
2362 */
2363static void rotate_ctx(struct perf_event_context *ctx)
2364{
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01002365 /*
2366 * Rotate the first entry of the non-pinned groups to the end of the
2367 * list. Rotation might be disabled by the inheritance code.
2368 */
2369 if (!ctx->rotate_disable)
2370 list_rotate_left(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002371}
2372
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002373/*
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002374 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2375 * because they're strictly cpu affine and rotate_start is called with IRQs
2376 * disabled, while rotate_context is called from IRQ context.
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002377 */
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002378static void perf_rotate_context(struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002379{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002380 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002381 struct perf_event_context *ctx = NULL;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002382 int rotate = 0, remove = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002383
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002384 if (cpuctx->ctx.nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002385 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002386 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2387 rotate = 1;
2388 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002389
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002390 ctx = cpuctx->task_ctx;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002391 if (ctx && ctx->nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002392 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002393 if (ctx->nr_events != ctx->nr_active)
2394 rotate = 1;
2395 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002396
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002397 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002398 perf_pmu_disable(cpuctx->ctx.pmu);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002399 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002400 if (ctx)
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002401 perf_ctx_adjust_freq(ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002402
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002403 if (!rotate)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002404 goto done;
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002405
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002406 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002407 if (ctx)
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02002408 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002409
2410 rotate_ctx(&cpuctx->ctx);
2411 if (ctx)
2412 rotate_ctx(ctx);
2413
Peter Zijlstradce58552011-04-09 21:17:46 +02002414 perf_event_sched_in(cpuctx, ctx, current);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002415
2416done:
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002417 if (remove)
2418 list_del_init(&cpuctx->rotation_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002419
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002420 perf_pmu_enable(cpuctx->ctx.pmu);
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002421 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002422}
2423
2424void perf_event_task_tick(void)
2425{
2426 struct list_head *head = &__get_cpu_var(rotation_list);
2427 struct perf_cpu_context *cpuctx, *tmp;
2428
2429 WARN_ON(!irqs_disabled());
2430
2431 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2432 if (cpuctx->jiffies_interval == 1 ||
2433 !(jiffies % cpuctx->jiffies_interval))
2434 perf_rotate_context(cpuctx);
2435 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002436}
2437
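/*
 * Added note (not in the original source): clear the enable_on_exec flag
 * and mark the event enabled; returns 1 if the event state actually changed.
 */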
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002438static int event_enable_on_exec(struct perf_event *event,
2439 struct perf_event_context *ctx)
2440{
2441 if (!event->attr.enable_on_exec)
2442 return 0;
2443
2444 event->attr.enable_on_exec = 0;
2445 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2446 return 0;
2447
2448 __perf_event_mark_enabled(event, ctx);
2449
2450 return 1;
2451}
2452
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002453/*
2454 * Enable all of a task's events that have been marked enable-on-exec.
2455 * This expects task == current.
2456 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002457static void perf_event_enable_on_exec(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002458{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002459 struct perf_event *event;
2460 unsigned long flags;
2461 int enabled = 0;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002462 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002463
2464 local_irq_save(flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002465 if (!ctx || !ctx->nr_events)
2466 goto out;
2467
Stephane Eraniane566b762011-04-06 02:54:54 +02002468 /*
2469 * We must context-switch out cgroup events to avoid a conflict
2470 * when invoking perf_task_event_sched_in() later on
2471 * in this function. Otherwise we end up trying to
2472 * context-switch in cgroup events which are already scheduled
2473 * in.
2474 */
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002475 perf_cgroup_sched_out(current, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002476
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002477 raw_spin_lock(&ctx->lock);
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02002478 task_ctx_sched_out(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002479
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002480 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2481 ret = event_enable_on_exec(event, ctx);
2482 if (ret)
2483 enabled = 1;
2484 }
2485
2486 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2487 ret = event_enable_on_exec(event, ctx);
2488 if (ret)
2489 enabled = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002490 }
2491
2492 /*
2493 * Unclone this context if we enabled any event.
2494 */
2495 if (enabled)
2496 unclone_ctx(ctx);
2497
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002498 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002499
Stephane Eraniane566b762011-04-06 02:54:54 +02002500 /*
2501 * This also context-switches in cgroup events, if any:
2502 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02002503 perf_event_context_sched_in(ctx, ctx->task);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002504out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002505 local_irq_restore(flags);
2506}
2507
2508/*
2509 * Cross CPU call to read the hardware event
2510 */
2511static void __perf_event_read(void *info)
2512{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002513 struct perf_event *event = info;
2514 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002515 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002516
2517 /*
2518 * If this is a task context, we need to check whether it is
2519 * the current task context of this cpu. If not it has been
2520 * scheduled out before the smp call arrived. In that case
2521 * event->count would have been updated to a recent sample
2522 * when the event was scheduled out.
2523 */
2524 if (ctx->task && cpuctx->task_ctx != ctx)
2525 return;
2526
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002527 raw_spin_lock(&ctx->lock);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002528 if (ctx->is_active) {
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002529 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002530 update_cgrp_time_from_event(event);
2531 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002532 update_event_times(event);
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002533 if (event->state == PERF_EVENT_STATE_ACTIVE)
2534 event->pmu->read(event);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002535 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002536}
2537
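/*
 * Added note (not in the original source): total count is this event's own
 * count plus the accumulated counts of all its children.
 */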
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002538static inline u64 perf_event_count(struct perf_event *event)
2539{
Peter Zijlstrae7850592010-05-21 14:43:08 +02002540 return local64_read(&event->count) + atomic64_read(&event->child_count);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002541}
2542
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002543static u64 perf_event_read(struct perf_event *event)
2544{
2545 /*
2546 * If the event is enabled and currently active on a CPU, update the
2547 * value in the event structure:
2548 */
2549 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2550 smp_call_function_single(event->oncpu,
2551 __perf_event_read, event, 1);
2552 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01002553 struct perf_event_context *ctx = event->ctx;
2554 unsigned long flags;
2555
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002556 raw_spin_lock_irqsave(&ctx->lock, flags);
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002557 /*
2558 * We may read while the context is not active
2559 * (e.g., the thread is blocked); in that case
2560 * we cannot update the context time.
2561 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02002562 if (ctx->is_active) {
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002563 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002564 update_cgrp_time_from_event(event);
2565 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002566 update_event_times(event);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002567 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002568 }
2569
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002570 return perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002571}
2572
2573/*
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002574 * Callchain support
2575 */
2576
2577struct callchain_cpus_entries {
2578 struct rcu_head rcu_head;
2579 struct perf_callchain_entry *cpu_entries[0];
2580};
2581
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002582static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002583static atomic_t nr_callchain_events;
2584static DEFINE_MUTEX(callchain_mutex);
2585struct callchain_cpus_entries *callchain_cpus_entries;
2586
2587
2588__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2589 struct pt_regs *regs)
2590{
2591}
2592
2593__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2594 struct pt_regs *regs)
2595{
2596}
2597
2598static void release_callchain_buffers_rcu(struct rcu_head *head)
2599{
2600 struct callchain_cpus_entries *entries;
2601 int cpu;
2602
2603 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2604
2605 for_each_possible_cpu(cpu)
2606 kfree(entries->cpu_entries[cpu]);
2607
2608 kfree(entries);
2609}
2610
2611static void release_callchain_buffers(void)
2612{
2613 struct callchain_cpus_entries *entries;
2614
2615 entries = callchain_cpus_entries;
2616 rcu_assign_pointer(callchain_cpus_entries, NULL);
2617 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2618}
2619
2620static int alloc_callchain_buffers(void)
2621{
2622 int cpu;
2623 int size;
2624 struct callchain_cpus_entries *entries;
2625
2626 /*
2627 * We can't use the percpu allocation API for data that can be
2628 * accessed from NMI. Use a temporary manual per cpu allocation
2629 * until that gets sorted out.
2630 */
Eric Dumazet88d4f0d2011-01-25 19:40:51 +01002631 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002632
2633 entries = kzalloc(size, GFP_KERNEL);
2634 if (!entries)
2635 return -ENOMEM;
2636
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002637 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002638
2639 for_each_possible_cpu(cpu) {
2640 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2641 cpu_to_node(cpu));
2642 if (!entries->cpu_entries[cpu])
2643 goto fail;
2644 }
2645
2646 rcu_assign_pointer(callchain_cpus_entries, entries);
2647
2648 return 0;
2649
2650fail:
2651 for_each_possible_cpu(cpu)
2652 kfree(entries->cpu_entries[cpu]);
2653 kfree(entries);
2654
2655 return -ENOMEM;
2656}
2657
2658static int get_callchain_buffers(void)
2659{
2660 int err = 0;
2661 int count;
2662
2663 mutex_lock(&callchain_mutex);
2664
2665 count = atomic_inc_return(&nr_callchain_events);
2666 if (WARN_ON_ONCE(count < 1)) {
2667 err = -EINVAL;
2668 goto exit;
2669 }
2670
2671 if (count > 1) {
2672 /* If the allocation failed, give up */
2673 if (!callchain_cpus_entries)
2674 err = -ENOMEM;
2675 goto exit;
2676 }
2677
2678 err = alloc_callchain_buffers();
2679 if (err)
2680 release_callchain_buffers();
2681exit:
2682 mutex_unlock(&callchain_mutex);
2683
2684 return err;
2685}
2686
2687static void put_callchain_buffers(void)
2688{
2689 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2690 release_callchain_buffers();
2691 mutex_unlock(&callchain_mutex);
2692 }
2693}
2694
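/*
 * Added note (not in the original source): claim one of the four per-cpu
 * recursion slots (task, softirq, irq, NMI) so callchain recording cannot
 * recurse within the same context; returns -1 if the slot is already taken.
 */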
2695static int get_recursion_context(int *recursion)
2696{
2697 int rctx;
2698
2699 if (in_nmi())
2700 rctx = 3;
2701 else if (in_irq())
2702 rctx = 2;
2703 else if (in_softirq())
2704 rctx = 1;
2705 else
2706 rctx = 0;
2707
2708 if (recursion[rctx])
2709 return -1;
2710
2711 recursion[rctx]++;
2712 barrier();
2713
2714 return rctx;
2715}
2716
2717static inline void put_recursion_context(int *recursion, int rctx)
2718{
2719 barrier();
2720 recursion[rctx]--;
2721}
2722
2723static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2724{
2725 int cpu;
2726 struct callchain_cpus_entries *entries;
2727
2728 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2729 if (*rctx == -1)
2730 return NULL;
2731
2732 entries = rcu_dereference(callchain_cpus_entries);
2733 if (!entries)
2734 return NULL;
2735
2736 cpu = smp_processor_id();
2737
2738 return &entries->cpu_entries[cpu][*rctx];
2739}
2740
2741static void
2742put_callchain_entry(int rctx)
2743{
2744 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2745}
2746
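/*
 * Added note (not in the original source): capture the kernel and (when
 * available) user callchain for @regs into a per-cpu entry, guarded by the
 * recursion context above.
 */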
2747static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2748{
2749 int rctx;
2750 struct perf_callchain_entry *entry;
2751
2752
2753 entry = get_callchain_entry(&rctx);
2754 if (rctx == -1)
2755 return NULL;
2756
2757 if (!entry)
2758 goto exit_put;
2759
2760 entry->nr = 0;
2761
2762 if (!user_mode(regs)) {
2763 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2764 perf_callchain_kernel(entry, regs);
2765 if (current->mm)
2766 regs = task_pt_regs(current);
2767 else
2768 regs = NULL;
2769 }
2770
2771 if (regs) {
2772 perf_callchain_store(entry, PERF_CONTEXT_USER);
2773 perf_callchain_user(entry, regs);
2774 }
2775
2776exit_put:
2777 put_callchain_entry(rctx);
2778
2779 return entry;
2780}
2781
2782/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002783 * Initialize the perf_event context in a task_struct:
2784 */
Peter Zijlstraeb184472010-09-07 15:55:13 +02002785static void __perf_event_init_context(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002786{
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002787 raw_spin_lock_init(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002788 mutex_init(&ctx->mutex);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002789 INIT_LIST_HEAD(&ctx->pinned_groups);
2790 INIT_LIST_HEAD(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002791 INIT_LIST_HEAD(&ctx->event_list);
2792 atomic_set(&ctx->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002793}
2794
Peter Zijlstraeb184472010-09-07 15:55:13 +02002795static struct perf_event_context *
2796alloc_perf_context(struct pmu *pmu, struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002797{
2798 struct perf_event_context *ctx;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002799
2800 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2801 if (!ctx)
2802 return NULL;
2803
2804 __perf_event_init_context(ctx);
2805 if (task) {
2806 ctx->task = task;
2807 get_task_struct(task);
2808 }
2809 ctx->pmu = pmu;
2810
2811 return ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002812}
2813
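/*
 * Added note (not in the original source): resolve @vpid to a task (current
 * when @vpid is 0), take a reference on it and check ptrace read permission.
 */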
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002814static struct task_struct *
2815find_lively_task_by_vpid(pid_t vpid)
2816{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002817 struct task_struct *task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002818 int err;
2819
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002820 rcu_read_lock();
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002821 if (!vpid)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002822 task = current;
2823 else
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002824 task = find_task_by_vpid(vpid);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002825 if (task)
2826 get_task_struct(task);
2827 rcu_read_unlock();
2828
2829 if (!task)
2830 return ERR_PTR(-ESRCH);
2831
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002832 /* Reuse ptrace permission checks for now. */
2833 err = -EACCES;
2834 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2835 goto errout;
2836
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002837 return task;
2838errout:
2839 put_task_struct(task);
2840 return ERR_PTR(err);
2841
2842}
2843
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002844/*
2845 * Returns a matching context with refcount and pincount.
2846 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002847static struct perf_event_context *
Matt Helsley38a81da2010-09-13 13:01:20 -07002848find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002849{
2850 struct perf_event_context *ctx;
2851 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002852 unsigned long flags;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002853 int ctxn, err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002854
Oleg Nesterov22a4ec72011-01-18 17:10:08 +01002855 if (!task) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002856 /* Must be root to operate on a CPU event: */
2857 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2858 return ERR_PTR(-EACCES);
2859
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002860 /*
2861	 * We could be clever and allow attaching an event to an
2862 * offline CPU and activate it when the CPU comes up, but
2863 * that's for later.
2864 */
2865 if (!cpu_online(cpu))
2866 return ERR_PTR(-ENODEV);
2867
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002868 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002869 ctx = &cpuctx->ctx;
2870 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002871 ++ctx->pin_count;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002872
2873 return ctx;
2874 }
2875
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002876 err = -EINVAL;
2877 ctxn = pmu->task_ctx_nr;
2878 if (ctxn < 0)
2879 goto errout;
2880
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002881retry:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002882 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002883 if (ctx) {
2884 unclone_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002885 ++ctx->pin_count;
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002886 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Peter Zijlstra9137fb22011-04-09 21:17:41 +02002887 } else {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002888 ctx = alloc_perf_context(pmu, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002889 err = -ENOMEM;
2890 if (!ctx)
2891 goto errout;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002892
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002893 err = 0;
2894 mutex_lock(&task->perf_event_mutex);
2895 /*
2896	 * If it has already passed perf_event_exit_task(),
2897	 * we must see PF_EXITING; it takes this mutex too.
2898 */
2899 if (task->flags & PF_EXITING)
2900 err = -ESRCH;
2901 else if (task->perf_event_ctxp[ctxn])
2902 err = -EAGAIN;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002903 else {
Peter Zijlstra9137fb22011-04-09 21:17:41 +02002904 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002905 ++ctx->pin_count;
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002906 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002907 }
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002908 mutex_unlock(&task->perf_event_mutex);
2909
2910 if (unlikely(err)) {
Peter Zijlstra9137fb22011-04-09 21:17:41 +02002911 put_ctx(ctx);
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002912
2913 if (err == -EAGAIN)
2914 goto retry;
2915 goto errout;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002916 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002917 }
2918
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002919 return ctx;
2920
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002921errout:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002922 return ERR_PTR(err);
2923}
2924
Li Zefan6fb29152009-10-15 11:21:42 +08002925static void perf_event_free_filter(struct perf_event *event);
2926
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002927static void free_event_rcu(struct rcu_head *head)
2928{
2929 struct perf_event *event;
2930
2931 event = container_of(head, struct perf_event, rcu_head);
2932 if (event->ns)
2933 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08002934 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002935 kfree(event);
2936}
2937
Frederic Weisbecker76369132011-05-19 19:55:04 +02002938static void ring_buffer_put(struct ring_buffer *rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002939
2940static void free_event(struct perf_event *event)
2941{
Peter Zijlstrae360adb2010-10-14 14:01:34 +08002942 irq_work_sync(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002943
2944 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002945 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02002946 jump_label_dec(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01002947 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002948 atomic_dec(&nr_mmap_events);
2949 if (event->attr.comm)
2950 atomic_dec(&nr_comm_events);
2951 if (event->attr.task)
2952 atomic_dec(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002953 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2954 put_callchain_buffers();
Peter Zijlstra08309372011-03-03 11:31:20 +01002955 if (is_cgroup_event(event)) {
2956 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2957 jump_label_dec(&perf_sched_events);
2958 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002959 }
2960
Frederic Weisbecker76369132011-05-19 19:55:04 +02002961 if (event->rb) {
2962 ring_buffer_put(event->rb);
2963 event->rb = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002964 }
2965
Stephane Eraniane5d13672011-02-14 11:20:01 +02002966 if (is_cgroup_event(event))
2967 perf_detach_cgroup(event);
2968
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002969 if (event->destroy)
2970 event->destroy(event);
2971
Peter Zijlstra0c67b402010-09-13 11:15:58 +02002972 if (event->ctx)
2973 put_ctx(event->ctx);
2974
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002975 call_rcu(&event->rcu_head, free_event_rcu);
2976}
2977
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002978int perf_event_release_kernel(struct perf_event *event)
2979{
2980 struct perf_event_context *ctx = event->ctx;
2981
2982 WARN_ON_ONCE(ctx->parent_ctx);
Peter Zijlstraa0507c82010-05-06 15:42:53 +02002983 /*
2984 * There are two ways this annotation is useful:
2985 *
2986 * 1) there is a lock recursion from perf_event_exit_task;
2987 * see the comment there.
2988 *
2989 * 2) there is a lock-inversion with mmap_sem through
2990 * perf_event_read_group(), which takes faults while
2991 * holding ctx->mutex; however, this is called after
2992 * the last filedesc has died, so there is no possibility
2993 * to trigger the AB-BA case.
2994 */
2995 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002996 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra8a495422010-05-27 15:47:49 +02002997 perf_group_detach(event);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002998 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrae03a9a52011-04-09 21:17:47 +02002999 perf_remove_from_context(event);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003000 mutex_unlock(&ctx->mutex);
3001
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003002 free_event(event);
3003
3004 return 0;
3005}
3006EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3007
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003008/*
3009 * Called when the last reference to the file is gone.
3010 */
3011static int perf_release(struct inode *inode, struct file *file)
3012{
3013 struct perf_event *event = file->private_data;
Peter Zijlstra88821352010-11-09 19:01:43 +01003014 struct task_struct *owner;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003015
3016 file->private_data = NULL;
3017
Peter Zijlstra88821352010-11-09 19:01:43 +01003018 rcu_read_lock();
3019 owner = ACCESS_ONCE(event->owner);
3020 /*
3021 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3022 * !owner it means the list deletion is complete and we can indeed
3023	 * free this event; otherwise we need to serialize on
3024 * owner->perf_event_mutex.
3025 */
3026 smp_read_barrier_depends();
3027 if (owner) {
3028 /*
3029 * Since delayed_put_task_struct() also drops the last
3030 * task reference we can safely take a new reference
3031 * while holding the rcu_read_lock().
3032 */
3033 get_task_struct(owner);
3034 }
3035 rcu_read_unlock();
3036
3037 if (owner) {
3038 mutex_lock(&owner->perf_event_mutex);
3039 /*
3040		 * We have to re-check the event->owner field: if it is cleared,
3041		 * we raced with perf_event_exit_task(); acquiring the mutex
3042 * ensured they're done, and we can proceed with freeing the
3043 * event.
3044 */
3045 if (event->owner)
3046 list_del_init(&event->owner_entry);
3047 mutex_unlock(&owner->perf_event_mutex);
3048 put_task_struct(owner);
3049 }
3050
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003051 return perf_event_release_kernel(event);
3052}
3053
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003054u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003055{
3056 struct perf_event *child;
3057 u64 total = 0;
3058
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003059 *enabled = 0;
3060 *running = 0;
3061
Peter Zijlstra6f105812009-11-20 22:19:56 +01003062 mutex_lock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003063 total += perf_event_read(event);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003064 *enabled += event->total_time_enabled +
3065 atomic64_read(&event->child_total_time_enabled);
3066 *running += event->total_time_running +
3067 atomic64_read(&event->child_total_time_running);
3068
3069 list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003070 total += perf_event_read(child);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003071 *enabled += child->total_time_enabled;
3072 *running += child->total_time_running;
3073 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003074 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003075
3076 return total;
3077}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003078EXPORT_SYMBOL_GPL(perf_event_read_value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003079
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003080static int perf_event_read_group(struct perf_event *event,
3081 u64 read_format, char __user *buf)
3082{
3083 struct perf_event *leader = event->group_leader, *sub;
Peter Zijlstra6f105812009-11-20 22:19:56 +01003084 int n = 0, size = 0, ret = -EFAULT;
3085 struct perf_event_context *ctx = leader->ctx;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003086 u64 values[5];
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003087 u64 count, enabled, running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003088
Peter Zijlstra6f105812009-11-20 22:19:56 +01003089 mutex_lock(&ctx->mutex);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003090 count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003091
3092 values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003093 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3094 values[n++] = enabled;
3095 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3096 values[n++] = running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003097 values[n++] = count;
3098 if (read_format & PERF_FORMAT_ID)
3099 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003100
3101 size = n * sizeof(u64);
3102
3103 if (copy_to_user(buf, values, size))
Peter Zijlstra6f105812009-11-20 22:19:56 +01003104 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003105
Peter Zijlstra6f105812009-11-20 22:19:56 +01003106 ret = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003107
3108 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstraabf48682009-11-20 22:19:49 +01003109 n = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003110
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003111 values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003112 if (read_format & PERF_FORMAT_ID)
3113 values[n++] = primary_event_id(sub);
3114
3115 size = n * sizeof(u64);
3116
Stephane Eranian184d3da2009-11-23 21:40:49 -08003117 if (copy_to_user(buf + ret, values, size)) {
Peter Zijlstra6f105812009-11-20 22:19:56 +01003118 ret = -EFAULT;
3119 goto unlock;
3120 }
Peter Zijlstraabf48682009-11-20 22:19:49 +01003121
3122 ret += size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003123 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003124unlock:
3125 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003126
Peter Zijlstraabf48682009-11-20 22:19:49 +01003127 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003128}
3129
3130static int perf_event_read_one(struct perf_event *event,
3131 u64 read_format, char __user *buf)
3132{
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003133 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003134 u64 values[4];
3135 int n = 0;
3136
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003137 values[n++] = perf_event_read_value(event, &enabled, &running);
3138 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3139 values[n++] = enabled;
3140 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3141 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003142 if (read_format & PERF_FORMAT_ID)
3143 values[n++] = primary_event_id(event);
3144
3145 if (copy_to_user(buf, values, n * sizeof(u64)))
3146 return -EFAULT;
3147
3148 return n * sizeof(u64);
3149}
3150
3151/*
3152 * Read the performance event - simple non-blocking version for now
3153 */
3154static ssize_t
3155perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3156{
3157 u64 read_format = event->attr.read_format;
3158 int ret;
3159
3160 /*
3161	 * Return end-of-file for a read on an event that is in an
3162	 * error state (i.e. because it was pinned but it couldn't be
3163	 * scheduled onto the CPU at some point).
3164 */
3165 if (event->state == PERF_EVENT_STATE_ERROR)
3166 return 0;
3167
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02003168 if (count < event->read_size)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003169 return -ENOSPC;
3170
3171 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003172 if (read_format & PERF_FORMAT_GROUP)
3173 ret = perf_event_read_group(event, read_format, buf);
3174 else
3175 ret = perf_event_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003176
3177 return ret;
3178}
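
/*
 * A minimal user-space sketch (not part of this file): decoding a read()
 * on a single, non-group event whose read_format requests both times and
 * the event id. The struct mirrors the order emitted by
 * perf_event_read_one() above; the helper name and the assumption that
 * "fd" came from perf_event_open() are illustrative.
 */
#include <stdint.h>
#include <unistd.h>

struct single_read {
	uint64_t value;		/* counter value */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	uint64_t id;		/* PERF_FORMAT_ID */
};

static int read_single_counter(int fd, struct single_read *r)
{
	ssize_t n = read(fd, r, sizeof(*r));

	return n == (ssize_t)sizeof(*r) ? 0 : -1;
}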
3179
3180static ssize_t
3181perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3182{
3183 struct perf_event *event = file->private_data;
3184
3185 return perf_read_hw(event, buf, count);
3186}
3187
3188static unsigned int perf_poll(struct file *file, poll_table *wait)
3189{
3190 struct perf_event *event = file->private_data;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003191 struct ring_buffer *rb;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003192 unsigned int events = POLL_HUP;
3193
3194 rcu_read_lock();
Frederic Weisbecker76369132011-05-19 19:55:04 +02003195 rb = rcu_dereference(event->rb);
3196 if (rb)
3197 events = atomic_xchg(&rb->poll, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003198 rcu_read_unlock();
3199
3200 poll_wait(file, &event->waitq, wait);
3201
3202 return events;
3203}
3204
3205static void perf_event_reset(struct perf_event *event)
3206{
3207 (void)perf_event_read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02003208 local64_set(&event->count, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003209 perf_event_update_userpage(event);
3210}
3211
3212/*
3213 * Holding the top-level event's child_mutex means that any
3214 * descendant process that has inherited this event will block
3215 * in sync_child_event if it goes to exit, thus satisfying the
3216 * task existence requirements of perf_event_enable/disable.
3217 */
3218static void perf_event_for_each_child(struct perf_event *event,
3219 void (*func)(struct perf_event *))
3220{
3221 struct perf_event *child;
3222
3223 WARN_ON_ONCE(event->ctx->parent_ctx);
3224 mutex_lock(&event->child_mutex);
3225 func(event);
3226 list_for_each_entry(child, &event->child_list, child_list)
3227 func(child);
3228 mutex_unlock(&event->child_mutex);
3229}
3230
3231static void perf_event_for_each(struct perf_event *event,
3232 void (*func)(struct perf_event *))
3233{
3234 struct perf_event_context *ctx = event->ctx;
3235 struct perf_event *sibling;
3236
3237 WARN_ON_ONCE(ctx->parent_ctx);
3238 mutex_lock(&ctx->mutex);
3239 event = event->group_leader;
3240
3241 perf_event_for_each_child(event, func);
3242 func(event);
3243 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3244 perf_event_for_each_child(event, func);
3245 mutex_unlock(&ctx->mutex);
3246}
3247
3248static int perf_event_period(struct perf_event *event, u64 __user *arg)
3249{
3250 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003251 int ret = 0;
3252 u64 value;
3253
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01003254 if (!is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003255 return -EINVAL;
3256
John Blackwoodad0cf342010-09-28 18:03:11 -04003257 if (copy_from_user(&value, arg, sizeof(value)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003258 return -EFAULT;
3259
3260 if (!value)
3261 return -EINVAL;
3262
Thomas Gleixnere625cce2009-11-17 18:02:06 +01003263 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003264 if (event->attr.freq) {
3265 if (value > sysctl_perf_event_sample_rate) {
3266 ret = -EINVAL;
3267 goto unlock;
3268 }
3269
3270 event->attr.sample_freq = value;
3271 } else {
3272 event->attr.sample_period = value;
3273 event->hw.sample_period = value;
3274 }
3275unlock:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01003276 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003277
3278 return ret;
3279}
3280
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003281static const struct file_operations perf_fops;
3282
3283static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3284{
3285 struct file *file;
3286
3287 file = fget_light(fd, fput_needed);
3288 if (!file)
3289 return ERR_PTR(-EBADF);
3290
3291 if (file->f_op != &perf_fops) {
3292 fput_light(file, *fput_needed);
3293 *fput_needed = 0;
3294 return ERR_PTR(-EBADF);
3295 }
3296
3297 return file->private_data;
3298}
3299
3300static int perf_event_set_output(struct perf_event *event,
3301 struct perf_event *output_event);
Li Zefan6fb29152009-10-15 11:21:42 +08003302static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003303
3304static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3305{
3306 struct perf_event *event = file->private_data;
3307 void (*func)(struct perf_event *);
3308 u32 flags = arg;
3309
3310 switch (cmd) {
3311 case PERF_EVENT_IOC_ENABLE:
3312 func = perf_event_enable;
3313 break;
3314 case PERF_EVENT_IOC_DISABLE:
3315 func = perf_event_disable;
3316 break;
3317 case PERF_EVENT_IOC_RESET:
3318 func = perf_event_reset;
3319 break;
3320
3321 case PERF_EVENT_IOC_REFRESH:
3322 return perf_event_refresh(event, arg);
3323
3324 case PERF_EVENT_IOC_PERIOD:
3325 return perf_event_period(event, (u64 __user *)arg);
3326
3327 case PERF_EVENT_IOC_SET_OUTPUT:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003328 {
3329 struct perf_event *output_event = NULL;
3330 int fput_needed = 0;
3331 int ret;
3332
3333 if (arg != -1) {
3334 output_event = perf_fget_light(arg, &fput_needed);
3335 if (IS_ERR(output_event))
3336 return PTR_ERR(output_event);
3337 }
3338
3339 ret = perf_event_set_output(event, output_event);
3340 if (output_event)
3341 fput_light(output_event->filp, fput_needed);
3342
3343 return ret;
3344 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003345
Li Zefan6fb29152009-10-15 11:21:42 +08003346 case PERF_EVENT_IOC_SET_FILTER:
3347 return perf_event_set_filter(event, (void __user *)arg);
3348
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003349 default:
3350 return -ENOTTY;
3351 }
3352
3353 if (flags & PERF_IOC_FLAG_GROUP)
3354 perf_event_for_each(event, func);
3355 else
3356 perf_event_for_each_child(event, func);
3357
3358 return 0;
3359}
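
/*
 * A user-space sketch of the ioctl paths handled above (assumptions:
 * "fd" and "group_fd" come from perf_event_open(), "target_fd" owns the
 * ring buffer we want to share; the helper name is illustrative).
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int retune_and_redirect(int fd, int group_fd, int target_fd)
{
	uint64_t period = 100000;	/* example value for perf_event_period() */

	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &period))
		return -1;
	/* PERF_IOC_FLAG_GROUP applies the operation to the whole group */
	if (ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP))
		return -1;
	/* route this event's output into target_fd's buffer; -1 would undo it */
	if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd))
		return -1;

	return ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}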
3360
3361int perf_event_task_enable(void)
3362{
3363 struct perf_event *event;
3364
3365 mutex_lock(&current->perf_event_mutex);
3366 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3367 perf_event_for_each_child(event, perf_event_enable);
3368 mutex_unlock(&current->perf_event_mutex);
3369
3370 return 0;
3371}
3372
3373int perf_event_task_disable(void)
3374{
3375 struct perf_event *event;
3376
3377 mutex_lock(&current->perf_event_mutex);
3378 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3379 perf_event_for_each_child(event, perf_event_disable);
3380 mutex_unlock(&current->perf_event_mutex);
3381
3382 return 0;
3383}
3384
3385#ifndef PERF_EVENT_INDEX_OFFSET
3386# define PERF_EVENT_INDEX_OFFSET 0
3387#endif
3388
3389static int perf_event_index(struct perf_event *event)
3390{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02003391 if (event->hw.state & PERF_HES_STOPPED)
3392 return 0;
3393
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003394 if (event->state != PERF_EVENT_STATE_ACTIVE)
3395 return 0;
3396
3397 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3398}
3399
Eric B Munsonc4794292011-06-23 16:34:38 -04003400static void calc_timer_values(struct perf_event *event,
Eric B Munson7f310a52011-06-23 16:34:38 -04003401 u64 *enabled,
3402 u64 *running)
Eric B Munsonc4794292011-06-23 16:34:38 -04003403{
3404 u64 now, ctx_time;
3405
3406 now = perf_clock();
3407 ctx_time = event->shadow_ctx_time + now;
3408 *enabled = ctx_time - event->tstamp_enabled;
3409 *running = ctx_time - event->tstamp_running;
3410}
3411
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003412/*
3413 * Callers need to ensure there can be no nesting of this function, otherwise
3414 * the seqlock logic goes bad. We cannot serialize this because the arch
3415 * code calls this from NMI context.
3416 */
3417void perf_event_update_userpage(struct perf_event *event)
3418{
3419 struct perf_event_mmap_page *userpg;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003420 struct ring_buffer *rb;
Eric B Munson0d641202011-06-24 12:26:26 -04003421 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003422
3423 rcu_read_lock();
Eric B Munson0d641202011-06-24 12:26:26 -04003424 /*
3425 * compute total_time_enabled, total_time_running
3426 * based on snapshot values taken when the event
3427 * was last scheduled in.
3428 *
3429	 * we cannot simply call update_context_time()
3430	 * because of locking issues, as we can be called in
3431	 * NMI context
3432 */
3433 calc_timer_values(event, &enabled, &running);
Frederic Weisbecker76369132011-05-19 19:55:04 +02003434 rb = rcu_dereference(event->rb);
3435 if (!rb)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003436 goto unlock;
3437
Frederic Weisbecker76369132011-05-19 19:55:04 +02003438 userpg = rb->user_page;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003439
3440 /*
3441	 * Disable preemption so as not to let the corresponding user-space
3442 * spin too long if we get preempted.
3443 */
3444 preempt_disable();
3445 ++userpg->lock;
3446 barrier();
3447 userpg->index = perf_event_index(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003448 userpg->offset = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003449 if (event->state == PERF_EVENT_STATE_ACTIVE)
Peter Zijlstrae7850592010-05-21 14:43:08 +02003450 userpg->offset -= local64_read(&event->hw.prev_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003451
Eric B Munson0d641202011-06-24 12:26:26 -04003452 userpg->time_enabled = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003453 atomic64_read(&event->child_total_time_enabled);
3454
Eric B Munson0d641202011-06-24 12:26:26 -04003455 userpg->time_running = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003456 atomic64_read(&event->child_total_time_running);
3457
3458 barrier();
3459 ++userpg->lock;
3460 preempt_enable();
3461unlock:
3462 rcu_read_unlock();
3463}
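
/*
 * The user-space counterpart, as a sketch (not kernel code): a reader of
 * the mmap()ed perf_event_mmap_page must retry whenever the "lock"
 * sequence word changes underneath it, mirroring the ++userpg->lock /
 * barrier() pairs written above. "pc" is assumed to point at the first
 * page of the perf mmap area.
 */
#include <stdint.h>
#include <linux/perf_event.h>

static int64_t read_user_page(volatile struct perf_event_mmap_page *pc,
			      uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;
	int64_t offset;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* pairs with the kernel's barrier() */
		offset   = pc->offset;
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		__sync_synchronize();
	} while (pc->lock != seq);

	return offset;
}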
3464
Peter Zijlstra906010b2009-09-21 16:08:49 +02003465static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3466{
3467 struct perf_event *event = vma->vm_file->private_data;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003468 struct ring_buffer *rb;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003469 int ret = VM_FAULT_SIGBUS;
3470
3471 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3472 if (vmf->pgoff == 0)
3473 ret = 0;
3474 return ret;
3475 }
3476
3477 rcu_read_lock();
Frederic Weisbecker76369132011-05-19 19:55:04 +02003478 rb = rcu_dereference(event->rb);
3479 if (!rb)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003480 goto unlock;
3481
3482 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3483 goto unlock;
3484
Frederic Weisbecker76369132011-05-19 19:55:04 +02003485 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003486 if (!vmf->page)
3487 goto unlock;
3488
3489 get_page(vmf->page);
3490 vmf->page->mapping = vma->vm_file->f_mapping;
3491 vmf->page->index = vmf->pgoff;
3492
3493 ret = 0;
3494unlock:
3495 rcu_read_unlock();
3496
3497 return ret;
3498}
3499
Frederic Weisbecker76369132011-05-19 19:55:04 +02003500static void rb_free_rcu(struct rcu_head *rcu_head)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003501{
Frederic Weisbecker76369132011-05-19 19:55:04 +02003502 struct ring_buffer *rb;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003503
Frederic Weisbecker76369132011-05-19 19:55:04 +02003504 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3505 rb_free(rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003506}
3507
Frederic Weisbecker76369132011-05-19 19:55:04 +02003508static struct ring_buffer *ring_buffer_get(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003509{
Frederic Weisbecker76369132011-05-19 19:55:04 +02003510 struct ring_buffer *rb;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003511
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003512 rcu_read_lock();
Frederic Weisbecker76369132011-05-19 19:55:04 +02003513 rb = rcu_dereference(event->rb);
3514 if (rb) {
3515 if (!atomic_inc_not_zero(&rb->refcount))
3516 rb = NULL;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003517 }
3518 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003519
Frederic Weisbecker76369132011-05-19 19:55:04 +02003520 return rb;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003521}
3522
Frederic Weisbecker76369132011-05-19 19:55:04 +02003523static void ring_buffer_put(struct ring_buffer *rb)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003524{
Frederic Weisbecker76369132011-05-19 19:55:04 +02003525 if (!atomic_dec_and_test(&rb->refcount))
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003526 return;
3527
Frederic Weisbecker76369132011-05-19 19:55:04 +02003528 call_rcu(&rb->rcu_head, rb_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003529}
3530
3531static void perf_mmap_open(struct vm_area_struct *vma)
3532{
3533 struct perf_event *event = vma->vm_file->private_data;
3534
3535 atomic_inc(&event->mmap_count);
3536}
3537
3538static void perf_mmap_close(struct vm_area_struct *vma)
3539{
3540 struct perf_event *event = vma->vm_file->private_data;
3541
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003542 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Frederic Weisbecker76369132011-05-19 19:55:04 +02003543 unsigned long size = perf_data_size(event->rb);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003544 struct user_struct *user = event->mmap_user;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003545 struct ring_buffer *rb = event->rb;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003546
Peter Zijlstra906010b2009-09-21 16:08:49 +02003547 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003548 vma->vm_mm->locked_vm -= event->mmap_locked;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003549 rcu_assign_pointer(event->rb, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003550 mutex_unlock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003551
Frederic Weisbecker76369132011-05-19 19:55:04 +02003552 ring_buffer_put(rb);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003553 free_uid(user);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003554 }
3555}
3556
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04003557static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003558 .open = perf_mmap_open,
3559 .close = perf_mmap_close,
3560 .fault = perf_mmap_fault,
3561 .page_mkwrite = perf_mmap_fault,
3562};
3563
3564static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3565{
3566 struct perf_event *event = file->private_data;
3567 unsigned long user_locked, user_lock_limit;
3568 struct user_struct *user = current_user();
3569 unsigned long locked, lock_limit;
Frederic Weisbecker76369132011-05-19 19:55:04 +02003570 struct ring_buffer *rb;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003571 unsigned long vma_size;
3572 unsigned long nr_pages;
3573 long user_extra, extra;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003574 int ret = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003575
Peter Zijlstrac7920612010-05-18 10:33:24 +02003576 /*
3577 * Don't allow mmap() of inherited per-task counters. This would
3578 * create a performance issue due to all children writing to the
Frederic Weisbecker76369132011-05-19 19:55:04 +02003579 * same rb.
Peter Zijlstrac7920612010-05-18 10:33:24 +02003580 */
3581 if (event->cpu == -1 && event->attr.inherit)
3582 return -EINVAL;
3583
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003584 if (!(vma->vm_flags & VM_SHARED))
3585 return -EINVAL;
3586
3587 vma_size = vma->vm_end - vma->vm_start;
3588 nr_pages = (vma_size / PAGE_SIZE) - 1;
3589
3590 /*
Frederic Weisbecker76369132011-05-19 19:55:04 +02003591	 * If we have rb pages, ensure their count is a power of two, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003592	 * can use bitmasks instead of modulo.
3593 */
3594 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3595 return -EINVAL;
3596
3597 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3598 return -EINVAL;
3599
3600 if (vma->vm_pgoff != 0)
3601 return -EINVAL;
3602
3603 WARN_ON_ONCE(event->ctx->parent_ctx);
3604 mutex_lock(&event->mmap_mutex);
Frederic Weisbecker76369132011-05-19 19:55:04 +02003605 if (event->rb) {
3606 if (event->rb->nr_pages == nr_pages)
3607 atomic_inc(&event->rb->refcount);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003608 else
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003609 ret = -EINVAL;
3610 goto unlock;
3611 }
3612
3613 user_extra = nr_pages + 1;
3614 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3615
3616 /*
3617 * Increase the limit linearly with more CPUs:
3618 */
3619 user_lock_limit *= num_online_cpus();
3620
3621 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3622
3623 extra = 0;
3624 if (user_locked > user_lock_limit)
3625 extra = user_locked - user_lock_limit;
3626
Jiri Slaby78d7d402010-03-05 13:42:54 -08003627 lock_limit = rlimit(RLIMIT_MEMLOCK);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003628 lock_limit >>= PAGE_SHIFT;
3629 locked = vma->vm_mm->locked_vm + extra;
3630
3631 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3632 !capable(CAP_IPC_LOCK)) {
3633 ret = -EPERM;
3634 goto unlock;
3635 }
3636
Frederic Weisbecker76369132011-05-19 19:55:04 +02003637 WARN_ON(event->rb);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003638
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003639 if (vma->vm_flags & VM_WRITE)
Frederic Weisbecker76369132011-05-19 19:55:04 +02003640 flags |= RING_BUFFER_WRITABLE;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003641
Vince Weaver4ec83632011-06-01 15:15:36 -04003642 rb = rb_alloc(nr_pages,
3643 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3644 event->cpu, flags);
3645
Frederic Weisbecker76369132011-05-19 19:55:04 +02003646 if (!rb) {
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003647 ret = -ENOMEM;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003648 goto unlock;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003649 }
Frederic Weisbecker76369132011-05-19 19:55:04 +02003650 rcu_assign_pointer(event->rb, rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003651
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003652 atomic_long_add(user_extra, &user->locked_vm);
3653 event->mmap_locked = extra;
3654 event->mmap_user = get_current_user();
3655 vma->vm_mm->locked_vm += event->mmap_locked;
3656
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003657unlock:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003658 if (!ret)
3659 atomic_inc(&event->mmap_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003660 mutex_unlock(&event->mmap_mutex);
3661
3662 vma->vm_flags |= VM_RESERVED;
3663 vma->vm_ops = &perf_mmap_vmops;
3664
3665 return ret;
3666}
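
/*
 * A user-space sketch of the sizing rules enforced above: the mapping
 * must be MAP_SHARED, start at file offset 0, and cover one metadata
 * page plus a power-of-two number of data pages. "fd" is assumed to
 * come from perf_event_open(); the helper name is illustrative.
 */
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

static void *map_perf_buffer(int fd, unsigned int data_pages /* power of two */)
{
	size_t len = (size_t)(1 + data_pages) * sysconf(_SC_PAGESIZE);
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	return base == MAP_FAILED ? NULL : base;
}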
3667
3668static int perf_fasync(int fd, struct file *filp, int on)
3669{
3670 struct inode *inode = filp->f_path.dentry->d_inode;
3671 struct perf_event *event = filp->private_data;
3672 int retval;
3673
3674 mutex_lock(&inode->i_mutex);
3675 retval = fasync_helper(fd, filp, on, &event->fasync);
3676 mutex_unlock(&inode->i_mutex);
3677
3678 if (retval < 0)
3679 return retval;
3680
3681 return 0;
3682}
3683
3684static const struct file_operations perf_fops = {
Arnd Bergmann3326c1c2010-03-23 19:09:33 +01003685 .llseek = no_llseek,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003686 .release = perf_release,
3687 .read = perf_read,
3688 .poll = perf_poll,
3689 .unlocked_ioctl = perf_ioctl,
3690 .compat_ioctl = perf_ioctl,
3691 .mmap = perf_mmap,
3692 .fasync = perf_fasync,
3693};
3694
3695/*
3696 * Perf event wakeup
3697 *
3698 * If there's data, ensure we set the poll() state and publish everything
3699 * to user-space before waking everybody up.
3700 */
3701
3702void perf_event_wakeup(struct perf_event *event)
3703{
3704 wake_up_all(&event->waitq);
3705
3706 if (event->pending_kill) {
3707 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3708 event->pending_kill = 0;
3709 }
3710}
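
/*
 * A user-space sketch of consuming that wakeup (assumes "fd" was opened
 * with perf_event_open() and mmap()ed so that wakeup_events or the
 * watermark generate wakeups; the helper name is illustrative).
 */
#include <poll.h>

static int wait_for_samples(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);	/* > 0 once data is available */
}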
3711
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003712static void perf_pending_event(struct irq_work *entry)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003713{
3714 struct perf_event *event = container_of(entry,
3715 struct perf_event, pending);
3716
3717 if (event->pending_disable) {
3718 event->pending_disable = 0;
3719 __perf_event_disable(event);
3720 }
3721
3722 if (event->pending_wakeup) {
3723 event->pending_wakeup = 0;
3724 perf_event_wakeup(event);
3725 }
3726}
3727
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003728/*
Zhang, Yanmin39447b32010-04-19 13:32:41 +08003729 * We assume there is only KVM supporting the callbacks.
3730 * Later on, we might change it to a list if there is
3731 * another virtualization implementation supporting the callbacks.
3732 */
3733struct perf_guest_info_callbacks *perf_guest_cbs;
3734
3735int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3736{
3737 perf_guest_cbs = cbs;
3738 return 0;
3739}
3740EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3741
3742int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3743{
3744 perf_guest_cbs = NULL;
3745 return 0;
3746}
3747EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
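
/*
 * A sketch of how a hypervisor module might hook in (KVM is the only
 * in-tree user). The example_* callbacks are illustrative placeholders;
 * only the register/unregister API above is taken from this file.
 */
static int example_is_in_guest(void)
{
	return 0;	/* placeholder: report "not in a guest" */
}

static int example_is_user_mode(void)
{
	return 0;	/* placeholder: guest CPL, if in a guest */
}

static unsigned long example_get_guest_ip(void)
{
	return 0;	/* placeholder: guest instruction pointer */
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	.is_in_guest	= example_is_in_guest,
	.is_user_mode	= example_is_user_mode,
	.get_guest_ip	= example_get_guest_ip,
};

/* typically called from the hypervisor's module init/exit paths:
 *	perf_register_guest_info_callbacks(&example_guest_cbs);
 *	perf_unregister_guest_info_callbacks(&example_guest_cbs);
 */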
3748
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003749static void __perf_event_header__init_id(struct perf_event_header *header,
3750 struct perf_sample_data *data,
3751 struct perf_event *event)
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02003752{
3753 u64 sample_type = event->attr.sample_type;
3754
3755 data->type = sample_type;
3756 header->size += event->id_header_size;
3757
3758 if (sample_type & PERF_SAMPLE_TID) {
3759 /* namespace issues */
3760 data->tid_entry.pid = perf_event_pid(event, current);
3761 data->tid_entry.tid = perf_event_tid(event, current);
3762 }
3763
3764 if (sample_type & PERF_SAMPLE_TIME)
3765 data->time = perf_clock();
3766
3767 if (sample_type & PERF_SAMPLE_ID)
3768 data->id = primary_event_id(event);
3769
3770 if (sample_type & PERF_SAMPLE_STREAM_ID)
3771 data->stream_id = event->id;
3772
3773 if (sample_type & PERF_SAMPLE_CPU) {
3774 data->cpu_entry.cpu = raw_smp_processor_id();
3775 data->cpu_entry.reserved = 0;
3776 }
3777}
3778
Frederic Weisbecker76369132011-05-19 19:55:04 +02003779void perf_event_header__init_id(struct perf_event_header *header,
3780 struct perf_sample_data *data,
3781 struct perf_event *event)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003782{
3783 if (event->attr.sample_id_all)
3784 __perf_event_header__init_id(header, data, event);
3785}
3786
3787static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3788 struct perf_sample_data *data)
3789{
3790 u64 sample_type = data->type;
3791
3792 if (sample_type & PERF_SAMPLE_TID)
3793 perf_output_put(handle, data->tid_entry);
3794
3795 if (sample_type & PERF_SAMPLE_TIME)
3796 perf_output_put(handle, data->time);
3797
3798 if (sample_type & PERF_SAMPLE_ID)
3799 perf_output_put(handle, data->id);
3800
3801 if (sample_type & PERF_SAMPLE_STREAM_ID)
3802 perf_output_put(handle, data->stream_id);
3803
3804 if (sample_type & PERF_SAMPLE_CPU)
3805 perf_output_put(handle, data->cpu_entry);
3806}
3807
Frederic Weisbecker76369132011-05-19 19:55:04 +02003808void perf_event__output_id_sample(struct perf_event *event,
3809 struct perf_output_handle *handle,
3810 struct perf_sample_data *sample)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003811{
3812 if (event->attr.sample_id_all)
3813 __perf_event__output_id_sample(handle, sample);
3814}
3815
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003816static void perf_output_read_one(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02003817 struct perf_event *event,
3818 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003819{
3820 u64 read_format = event->attr.read_format;
3821 u64 values[4];
3822 int n = 0;
3823
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003824 values[n++] = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003825 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
Stephane Eranianeed01522010-10-26 16:08:01 +02003826 values[n++] = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003827 atomic64_read(&event->child_total_time_enabled);
3828 }
3829 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
Stephane Eranianeed01522010-10-26 16:08:01 +02003830 values[n++] = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003831 atomic64_read(&event->child_total_time_running);
3832 }
3833 if (read_format & PERF_FORMAT_ID)
3834 values[n++] = primary_event_id(event);
3835
Frederic Weisbecker76369132011-05-19 19:55:04 +02003836 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003837}
3838
3839/*
3840 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3841 */
3842static void perf_output_read_group(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02003843 struct perf_event *event,
3844 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003845{
3846 struct perf_event *leader = event->group_leader, *sub;
3847 u64 read_format = event->attr.read_format;
3848 u64 values[5];
3849 int n = 0;
3850
3851 values[n++] = 1 + leader->nr_siblings;
3852
3853 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
Stephane Eranianeed01522010-10-26 16:08:01 +02003854 values[n++] = enabled;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003855
3856 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
Stephane Eranianeed01522010-10-26 16:08:01 +02003857 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003858
3859 if (leader != event)
3860 leader->pmu->read(leader);
3861
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003862 values[n++] = perf_event_count(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003863 if (read_format & PERF_FORMAT_ID)
3864 values[n++] = primary_event_id(leader);
3865
Frederic Weisbecker76369132011-05-19 19:55:04 +02003866 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003867
3868 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3869 n = 0;
3870
3871 if (sub != event)
3872 sub->pmu->read(sub);
3873
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003874 values[n++] = perf_event_count(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003875 if (read_format & PERF_FORMAT_ID)
3876 values[n++] = primary_event_id(sub);
3877
Frederic Weisbecker76369132011-05-19 19:55:04 +02003878 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003879 }
3880}
3881
Stephane Eranianeed01522010-10-26 16:08:01 +02003882#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3883 PERF_FORMAT_TOTAL_TIME_RUNNING)
3884
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003885static void perf_output_read(struct perf_output_handle *handle,
3886 struct perf_event *event)
3887{
Eric B Munsonc4794292011-06-23 16:34:38 -04003888 u64 enabled = 0, running = 0;
Stephane Eranianeed01522010-10-26 16:08:01 +02003889 u64 read_format = event->attr.read_format;
3890
3891 /*
3892 * compute total_time_enabled, total_time_running
3893 * based on snapshot values taken when the event
3894 * was last scheduled in.
3895 *
3896	 * we cannot simply call update_context_time()
3897	 * because of locking issues, as we are called in
3898	 * NMI context
3899 */
Eric B Munsonc4794292011-06-23 16:34:38 -04003900 if (read_format & PERF_FORMAT_TOTAL_TIMES)
3901 calc_timer_values(event, &enabled, &running);
Stephane Eranianeed01522010-10-26 16:08:01 +02003902
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003903 if (event->attr.read_format & PERF_FORMAT_GROUP)
Stephane Eranianeed01522010-10-26 16:08:01 +02003904 perf_output_read_group(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003905 else
Stephane Eranianeed01522010-10-26 16:08:01 +02003906 perf_output_read_one(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003907}
3908
3909void perf_output_sample(struct perf_output_handle *handle,
3910 struct perf_event_header *header,
3911 struct perf_sample_data *data,
3912 struct perf_event *event)
3913{
3914 u64 sample_type = data->type;
3915
3916 perf_output_put(handle, *header);
3917
3918 if (sample_type & PERF_SAMPLE_IP)
3919 perf_output_put(handle, data->ip);
3920
3921 if (sample_type & PERF_SAMPLE_TID)
3922 perf_output_put(handle, data->tid_entry);
3923
3924 if (sample_type & PERF_SAMPLE_TIME)
3925 perf_output_put(handle, data->time);
3926
3927 if (sample_type & PERF_SAMPLE_ADDR)
3928 perf_output_put(handle, data->addr);
3929
3930 if (sample_type & PERF_SAMPLE_ID)
3931 perf_output_put(handle, data->id);
3932
3933 if (sample_type & PERF_SAMPLE_STREAM_ID)
3934 perf_output_put(handle, data->stream_id);
3935
3936 if (sample_type & PERF_SAMPLE_CPU)
3937 perf_output_put(handle, data->cpu_entry);
3938
3939 if (sample_type & PERF_SAMPLE_PERIOD)
3940 perf_output_put(handle, data->period);
3941
3942 if (sample_type & PERF_SAMPLE_READ)
3943 perf_output_read(handle, event);
3944
3945 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3946 if (data->callchain) {
3947 int size = 1;
3948
3949 if (data->callchain)
3950 size += data->callchain->nr;
3951
3952 size *= sizeof(u64);
3953
Frederic Weisbecker76369132011-05-19 19:55:04 +02003954 __output_copy(handle, data->callchain, size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003955 } else {
3956 u64 nr = 0;
3957 perf_output_put(handle, nr);
3958 }
3959 }
3960
3961 if (sample_type & PERF_SAMPLE_RAW) {
3962 if (data->raw) {
3963 perf_output_put(handle, data->raw->size);
Frederic Weisbecker76369132011-05-19 19:55:04 +02003964 __output_copy(handle, data->raw->data,
3965 data->raw->size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003966 } else {
3967 struct {
3968 u32 size;
3969 u32 data;
3970 } raw = {
3971 .size = sizeof(u32),
3972 .data = 0,
3973 };
3974 perf_output_put(handle, raw);
3975 }
3976 }
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02003977
3978 if (!event->attr.watermark) {
3979 int wakeup_events = event->attr.wakeup_events;
3980
3981 if (wakeup_events) {
3982 struct ring_buffer *rb = handle->rb;
3983 int events = local_inc_return(&rb->events);
3984
3985 if (events >= wakeup_events) {
3986 local_sub(wakeup_events, &rb->events);
3987 local_inc(&rb->wakeup);
3988 }
3989 }
3990 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003991}
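
/*
 * A decoding aid, not a kernel structure: for an event sampled with
 * sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
 * PERF_SAMPLE_PERIOD, the record body written by perf_output_sample()
 * above lays out as follows in the ring buffer (assuming user space
 * includes <linux/perf_event.h> for struct perf_event_header).
 */
#include <stdint.h>
#include <linux/perf_event.h>

struct sample_record_example {
	struct perf_event_header header;	/* header.type == PERF_RECORD_SAMPLE */
	uint64_t ip;				/* PERF_SAMPLE_IP */
	uint32_t pid, tid;			/* PERF_SAMPLE_TID */
	uint64_t time;				/* PERF_SAMPLE_TIME */
	uint64_t period;			/* PERF_SAMPLE_PERIOD */
};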
3992
3993void perf_prepare_sample(struct perf_event_header *header,
3994 struct perf_sample_data *data,
3995 struct perf_event *event,
3996 struct pt_regs *regs)
3997{
3998 u64 sample_type = event->attr.sample_type;
3999
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004000 header->type = PERF_RECORD_SAMPLE;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004001 header->size = sizeof(*header) + event->header_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004002
4003 header->misc = 0;
4004 header->misc |= perf_misc_flags(regs);
4005
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004006 __perf_event_header__init_id(header, data, event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02004007
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004008 if (sample_type & PERF_SAMPLE_IP)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004009 data->ip = perf_instruction_pointer(regs);
4010
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004011 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4012 int size = 1;
4013
4014 data->callchain = perf_callchain(regs);
4015
4016 if (data->callchain)
4017 size += data->callchain->nr;
4018
4019 header->size += size * sizeof(u64);
4020 }
4021
4022 if (sample_type & PERF_SAMPLE_RAW) {
4023 int size = sizeof(u32);
4024
4025 if (data->raw)
4026 size += data->raw->size;
4027 else
4028 size += sizeof(u32);
4029
4030 WARN_ON_ONCE(size & (sizeof(u64)-1));
4031 header->size += size;
4032 }
4033}
4034
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004035static void perf_event_output(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004036 struct perf_sample_data *data,
4037 struct pt_regs *regs)
4038{
4039 struct perf_output_handle handle;
4040 struct perf_event_header header;
4041
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004042 /* protect the callchain buffers */
4043 rcu_read_lock();
4044
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004045 perf_prepare_sample(&header, data, event, regs);
4046
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004047 if (perf_output_begin(&handle, event, header.size))
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004048 goto exit;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004049
4050 perf_output_sample(&handle, &header, data, event);
4051
4052 perf_output_end(&handle);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004053
4054exit:
4055 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004056}
4057
4058/*
4059 * read event_id
4060 */
4061
4062struct perf_read_event {
4063 struct perf_event_header header;
4064
4065 u32 pid;
4066 u32 tid;
4067};
4068
4069static void
4070perf_event_read_event(struct perf_event *event,
4071 struct task_struct *task)
4072{
4073 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004074 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004075 struct perf_read_event read_event = {
4076 .header = {
4077 .type = PERF_RECORD_READ,
4078 .misc = 0,
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004079 .size = sizeof(read_event) + event->read_size,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004080 },
4081 .pid = perf_event_pid(event, task),
4082 .tid = perf_event_tid(event, task),
4083 };
4084 int ret;
4085
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004086 perf_event_header__init_id(&read_event.header, &sample, event);
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004087 ret = perf_output_begin(&handle, event, read_event.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004088 if (ret)
4089 return;
4090
4091 perf_output_put(&handle, read_event);
4092 perf_output_read(&handle, event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004093 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004094
4095 perf_output_end(&handle);
4096}
4097
4098/*
4099 * task tracking -- fork/exit
4100 *
Eric B Munson3af9e852010-05-18 15:30:49 +01004101 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004102 */
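
/*
 * A user-space sketch of enabling those side-band records (the flag
 * fields are the ones named above; the event type/config chosen here is
 * an arbitrary example).
 */
#include <string.h>
#include <linux/perf_event.h>

static void init_sideband_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size	= sizeof(*attr);
	attr->type	= PERF_TYPE_SOFTWARE;
	attr->config	= PERF_COUNT_SW_CPU_CLOCK;
	attr->task	= 1;	/* PERF_RECORD_FORK / PERF_RECORD_EXIT */
	attr->comm	= 1;	/* PERF_RECORD_COMM */
	attr->mmap	= 1;	/* PERF_RECORD_MMAP for executable mappings */
	attr->mmap_data	= 1;	/* ...and for data mappings as well */
}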
4103
4104struct perf_task_event {
4105 struct task_struct *task;
4106 struct perf_event_context *task_ctx;
4107
4108 struct {
4109 struct perf_event_header header;
4110
4111 u32 pid;
4112 u32 ppid;
4113 u32 tid;
4114 u32 ptid;
4115 u64 time;
4116 } event_id;
4117};
4118
4119static void perf_event_task_output(struct perf_event *event,
4120 struct perf_task_event *task_event)
4121{
4122 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004123 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004124 struct task_struct *task = task_event->task;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004125 int ret, size = task_event->event_id.header.size;
Mike Galbraith8bb39f92010-03-26 11:11:33 +01004126
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004127 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004128
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004129 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004130 task_event->event_id.header.size);
Peter Zijlstraef607772010-05-18 10:50:41 +02004131 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004132 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004133
4134 task_event->event_id.pid = perf_event_pid(event, task);
4135 task_event->event_id.ppid = perf_event_pid(event, current);
4136
4137 task_event->event_id.tid = perf_event_tid(event, task);
4138 task_event->event_id.ptid = perf_event_tid(event, current);
4139
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004140 perf_output_put(&handle, task_event->event_id);
4141
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004142 perf_event__output_id_sample(event, &handle, &sample);
4143
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004144 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004145out:
4146 task_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004147}
4148
4149static int perf_event_task_match(struct perf_event *event)
4150{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004151 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004152 return 0;
4153
Stephane Eranian5632ab12011-01-03 18:20:01 +02004154 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004155 return 0;
4156
Eric B Munson3af9e852010-05-18 15:30:49 +01004157 if (event->attr.comm || event->attr.mmap ||
4158 event->attr.mmap_data || event->attr.task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004159 return 1;
4160
4161 return 0;
4162}
4163
4164static void perf_event_task_ctx(struct perf_event_context *ctx,
4165 struct perf_task_event *task_event)
4166{
4167 struct perf_event *event;
4168
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004169 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4170 if (perf_event_task_match(event))
4171 perf_event_task_output(event, task_event);
4172 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004173}
4174
4175static void perf_event_task_event(struct perf_task_event *task_event)
4176{
4177 struct perf_cpu_context *cpuctx;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004178 struct perf_event_context *ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004179 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004180 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004181
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01004182 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004183 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004184 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004185 if (cpuctx->active_pmu != pmu)
4186 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004187 perf_event_task_ctx(&cpuctx->ctx, task_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004188
4189 ctx = task_event->task_ctx;
4190 if (!ctx) {
4191 ctxn = pmu->task_ctx_nr;
4192 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004193 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004194 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4195 }
4196 if (ctx)
4197 perf_event_task_ctx(ctx, task_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004198next:
4199 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004200 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004201 rcu_read_unlock();
4202}
4203
4204static void perf_event_task(struct task_struct *task,
4205 struct perf_event_context *task_ctx,
4206 int new)
4207{
4208 struct perf_task_event task_event;
4209
4210 if (!atomic_read(&nr_comm_events) &&
4211 !atomic_read(&nr_mmap_events) &&
4212 !atomic_read(&nr_task_events))
4213 return;
4214
4215 task_event = (struct perf_task_event){
4216 .task = task,
4217 .task_ctx = task_ctx,
4218 .event_id = {
4219 .header = {
4220 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4221 .misc = 0,
4222 .size = sizeof(task_event.event_id),
4223 },
4224 /* .pid */
4225 /* .ppid */
4226 /* .tid */
4227 /* .ptid */
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004228 .time = perf_clock(),
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004229 },
4230 };
4231
4232 perf_event_task_event(&task_event);
4233}
4234
4235void perf_event_fork(struct task_struct *task)
4236{
4237 perf_event_task(task, NULL, 1);
4238}
4239
4240/*
4241 * comm tracking
4242 */
4243
4244struct perf_comm_event {
4245 struct task_struct *task;
4246 char *comm;
4247 int comm_size;
4248
4249 struct {
4250 struct perf_event_header header;
4251
4252 u32 pid;
4253 u32 tid;
4254 } event_id;
4255};
4256
4257static void perf_event_comm_output(struct perf_event *event,
4258 struct perf_comm_event *comm_event)
4259{
4260 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004261 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004262 int size = comm_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004263 int ret;
4264
4265 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4266 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004267 comm_event->event_id.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004268
4269 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004270 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004271
4272 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4273 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4274
4275 perf_output_put(&handle, comm_event->event_id);
Frederic Weisbecker76369132011-05-19 19:55:04 +02004276 __output_copy(&handle, comm_event->comm,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004277 comm_event->comm_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004278
4279 perf_event__output_id_sample(event, &handle, &sample);
4280
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004281 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004282out:
4283 comm_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004284}
4285
4286static int perf_event_comm_match(struct perf_event *event)
4287{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004288 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004289 return 0;
4290
Stephane Eranian5632ab12011-01-03 18:20:01 +02004291 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004292 return 0;
4293
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004294 if (event->attr.comm)
4295 return 1;
4296
4297 return 0;
4298}
4299
4300static void perf_event_comm_ctx(struct perf_event_context *ctx,
4301 struct perf_comm_event *comm_event)
4302{
4303 struct perf_event *event;
4304
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004305 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4306 if (perf_event_comm_match(event))
4307 perf_event_comm_output(event, comm_event);
4308 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004309}
4310
4311static void perf_event_comm_event(struct perf_comm_event *comm_event)
4312{
4313 struct perf_cpu_context *cpuctx;
4314 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004315 char comm[TASK_COMM_LEN];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004316 unsigned int size;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004317 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004318 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004319
4320 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01004321 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004322 size = ALIGN(strlen(comm)+1, sizeof(u64));
4323
4324 comm_event->comm = comm;
4325 comm_event->comm_size = size;
4326
4327 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
Peter Zijlstraf6595f32009-11-20 22:19:47 +01004328 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004329 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004330 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004331 if (cpuctx->active_pmu != pmu)
4332 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004333 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004334
4335 ctxn = pmu->task_ctx_nr;
4336 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004337 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004338
4339 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4340 if (ctx)
4341 perf_event_comm_ctx(ctx, comm_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004342next:
4343 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004344 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004345 rcu_read_unlock();
4346}
4347
4348void perf_event_comm(struct task_struct *task)
4349{
4350 struct perf_comm_event comm_event;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004351 struct perf_event_context *ctx;
4352 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004353
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004354 for_each_task_context_nr(ctxn) {
4355 ctx = task->perf_event_ctxp[ctxn];
4356 if (!ctx)
4357 continue;
4358
4359 perf_event_enable_on_exec(ctx);
4360 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004361
4362 if (!atomic_read(&nr_comm_events))
4363 return;
4364
4365 comm_event = (struct perf_comm_event){
4366 .task = task,
4367 /* .comm */
4368 /* .comm_size */
4369 .event_id = {
4370 .header = {
4371 .type = PERF_RECORD_COMM,
4372 .misc = 0,
4373 /* .size */
4374 },
4375 /* .pid */
4376 /* .tid */
4377 },
4378 };
4379
4380 perf_event_comm_event(&comm_event);
4381}
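/*
 * perf_event_comm() runs whenever a task's comm changes, for example
 * from set_task_comm() on exec or via prctl(PR_SET_NAME); note that it
 * also re-arms enable_on_exec events before checking nr_comm_events.
 * The record written by perf_event_comm_output() above decodes,
 * roughly, as:
 *
 *	struct perf_event_header header;	// PERF_RECORD_COMM
 *	u32  pid, tid;
 *	char comm[];				// zero padded to 8 bytes
 *	{ sample_id fields }			// only if attr.sample_id_all
 */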
4382
4383/*
4384 * mmap tracking
4385 */
4386
4387struct perf_mmap_event {
4388 struct vm_area_struct *vma;
4389
4390 const char *file_name;
4391 int file_size;
4392
4393 struct {
4394 struct perf_event_header header;
4395
4396 u32 pid;
4397 u32 tid;
4398 u64 start;
4399 u64 len;
4400 u64 pgoff;
4401 } event_id;
4402};
4403
4404static void perf_event_mmap_output(struct perf_event *event,
4405 struct perf_mmap_event *mmap_event)
4406{
4407 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004408 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004409 int size = mmap_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004410 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004411
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004412 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4413 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004414 mmap_event->event_id.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004415 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004416 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004417
4418 mmap_event->event_id.pid = perf_event_pid(event, current);
4419 mmap_event->event_id.tid = perf_event_tid(event, current);
4420
4421 perf_output_put(&handle, mmap_event->event_id);
Frederic Weisbecker76369132011-05-19 19:55:04 +02004422 __output_copy(&handle, mmap_event->file_name,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004423 mmap_event->file_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004424
4425 perf_event__output_id_sample(event, &handle, &sample);
4426
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004427 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004428out:
4429 mmap_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004430}
4431
4432static int perf_event_mmap_match(struct perf_event *event,
Eric B Munson3af9e852010-05-18 15:30:49 +01004433 struct perf_mmap_event *mmap_event,
4434 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004435{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004436 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004437 return 0;
4438
Stephane Eranian5632ab12011-01-03 18:20:01 +02004439 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004440 return 0;
4441
Eric B Munson3af9e852010-05-18 15:30:49 +01004442 if ((!executable && event->attr.mmap_data) ||
4443 (executable && event->attr.mmap))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004444 return 1;
4445
4446 return 0;
4447}
4448
4449static void perf_event_mmap_ctx(struct perf_event_context *ctx,
Eric B Munson3af9e852010-05-18 15:30:49 +01004450 struct perf_mmap_event *mmap_event,
4451 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004452{
4453 struct perf_event *event;
4454
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004455 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Eric B Munson3af9e852010-05-18 15:30:49 +01004456 if (perf_event_mmap_match(event, mmap_event, executable))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004457 perf_event_mmap_output(event, mmap_event);
4458 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004459}
4460
4461static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4462{
4463 struct perf_cpu_context *cpuctx;
4464 struct perf_event_context *ctx;
4465 struct vm_area_struct *vma = mmap_event->vma;
4466 struct file *file = vma->vm_file;
4467 unsigned int size;
4468 char tmp[16];
4469 char *buf = NULL;
4470 const char *name;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004471 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004472 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004473
4474 memset(tmp, 0, sizeof(tmp));
4475
4476 if (file) {
4477 /*
Frederic Weisbecker76369132011-05-19 19:55:04 +02004478		 * d_path() fills the buffer from the end backwards, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004479 * need to add enough zero bytes after the string to handle
4480 * the 64bit alignment we do later.
4481 */
4482 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4483 if (!buf) {
4484 name = strncpy(tmp, "//enomem", sizeof(tmp));
4485 goto got_name;
4486 }
4487 name = d_path(&file->f_path, buf, PATH_MAX);
4488 if (IS_ERR(name)) {
4489 name = strncpy(tmp, "//toolong", sizeof(tmp));
4490 goto got_name;
4491 }
4492 } else {
4493 if (arch_vma_name(mmap_event->vma)) {
4494 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4495 sizeof(tmp));
4496 goto got_name;
4497 }
4498
4499 if (!vma->vm_mm) {
4500 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4501 goto got_name;
Eric B Munson3af9e852010-05-18 15:30:49 +01004502 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4503 vma->vm_end >= vma->vm_mm->brk) {
4504 name = strncpy(tmp, "[heap]", sizeof(tmp));
4505 goto got_name;
4506 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4507 vma->vm_end >= vma->vm_mm->start_stack) {
4508 name = strncpy(tmp, "[stack]", sizeof(tmp));
4509 goto got_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004510 }
4511
4512 name = strncpy(tmp, "//anon", sizeof(tmp));
4513 goto got_name;
4514 }
4515
4516got_name:
4517 size = ALIGN(strlen(name)+1, sizeof(u64));
4518
4519 mmap_event->file_name = name;
4520 mmap_event->file_size = size;
4521
4522 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4523
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01004524 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004525 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004526 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004527 if (cpuctx->active_pmu != pmu)
4528 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004529 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4530 vma->vm_flags & VM_EXEC);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004531
4532 ctxn = pmu->task_ctx_nr;
4533 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004534 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004535
4536 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4537 if (ctx) {
4538 perf_event_mmap_ctx(ctx, mmap_event,
4539 vma->vm_flags & VM_EXEC);
4540 }
Peter Zijlstra41945f62010-09-16 19:17:24 +02004541next:
4542 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004543 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004544 rcu_read_unlock();
4545
4546 kfree(buf);
4547}
4548
Eric B Munson3af9e852010-05-18 15:30:49 +01004549void perf_event_mmap(struct vm_area_struct *vma)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004550{
4551 struct perf_mmap_event mmap_event;
4552
4553 if (!atomic_read(&nr_mmap_events))
4554 return;
4555
4556 mmap_event = (struct perf_mmap_event){
4557 .vma = vma,
4558 /* .file_name */
4559 /* .file_size */
4560 .event_id = {
4561 .header = {
4562 .type = PERF_RECORD_MMAP,
Zhang, Yanmin39447b32010-04-19 13:32:41 +08004563 .misc = PERF_RECORD_MISC_USER,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004564 /* .size */
4565 },
4566 /* .pid */
4567 /* .tid */
4568 .start = vma->vm_start,
4569 .len = vma->vm_end - vma->vm_start,
Peter Zijlstra3a0304e2010-02-26 10:33:41 +01004570 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004571 },
4572 };
4573
4574 perf_event_mmap_event(&mmap_event);
4575}
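/*
 * perf_event_mmap() is called from the mmap() and exec paths once a
 * new mapping is in place.  File mappings carry the d_path() name;
 * anonymous and special mappings get the synthetic "//anon", "[vdso]",
 * "[heap]" or "[stack]" strings built above.  An illustrative
 * PERF_RECORD_MMAP for a shared library (values invented) might
 * decode as:
 *
 *	pid 1234, tid 1234, start 0x7f1c2e400000, len 0x1c2000, pgoff 0,
 *	filename /lib/libc-2.13.so
 */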
4576
4577/*
4578 * IRQ throttle logging
4579 */
4580
4581static void perf_log_throttle(struct perf_event *event, int enable)
4582{
4583 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004584 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004585 int ret;
4586
4587 struct {
4588 struct perf_event_header header;
4589 u64 time;
4590 u64 id;
4591 u64 stream_id;
4592 } throttle_event = {
4593 .header = {
4594 .type = PERF_RECORD_THROTTLE,
4595 .misc = 0,
4596 .size = sizeof(throttle_event),
4597 },
4598 .time = perf_clock(),
4599 .id = primary_event_id(event),
4600 .stream_id = event->id,
4601 };
4602
4603 if (enable)
4604 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4605
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004606 perf_event_header__init_id(&throttle_event.header, &sample, event);
4607
4608 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02004609 throttle_event.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004610 if (ret)
4611 return;
4612
4613 perf_output_put(&handle, throttle_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004614 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004615 perf_output_end(&handle);
4616}
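/*
 * A PERF_RECORD_THROTTLE is logged the first time a sampling event
 * takes more than max_samples_per_tick interrupts in one tick (see
 * __perf_event_overflow() below), and the matching UNTHROTTLE record
 * is logged once the tick handler lets the event sample again.  The
 * per-tick limit is derived from the perf_event_max_sample_rate
 * sysctl.
 */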
4617
4618/*
4619 * Generic event overflow handling, sampling.
4620 */
4621
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004622static int __perf_event_overflow(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004623 int throttle, struct perf_sample_data *data,
4624 struct pt_regs *regs)
4625{
4626 int events = atomic_read(&event->event_limit);
4627 struct hw_perf_event *hwc = &event->hw;
4628 int ret = 0;
4629
Peter Zijlstra96398822010-11-24 18:55:29 +01004630 /*
4631 * Non-sampling counters might still use the PMI to fold short
4632	 * hardware counters; ignore those.
4633 */
4634 if (unlikely(!is_sampling_event(event)))
4635 return 0;
4636
Peter Zijlstra163ec432011-02-16 11:22:34 +01004637 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4638 if (throttle) {
4639 hwc->interrupts = MAX_INTERRUPTS;
4640 perf_log_throttle(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004641 ret = 1;
4642 }
Peter Zijlstra163ec432011-02-16 11:22:34 +01004643 } else
4644 hwc->interrupts++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004645
4646 if (event->attr.freq) {
4647 u64 now = perf_clock();
Peter Zijlstraabd50712010-01-26 18:50:16 +01004648 s64 delta = now - hwc->freq_time_stamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004649
Peter Zijlstraabd50712010-01-26 18:50:16 +01004650 hwc->freq_time_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004651
Peter Zijlstraabd50712010-01-26 18:50:16 +01004652 if (delta > 0 && delta < 2*TICK_NSEC)
4653 perf_adjust_period(event, delta, hwc->last_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004654 }
4655
4656 /*
4657 * XXX event_limit might not quite work as expected on inherited
4658 * events
4659 */
4660
4661 event->pending_kill = POLL_IN;
4662 if (events && atomic_dec_and_test(&event->event_limit)) {
4663 ret = 1;
4664 event->pending_kill = POLL_HUP;
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004665 event->pending_disable = 1;
4666 irq_work_queue(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004667 }
4668
Peter Zijlstra453f19e2009-11-20 22:19:43 +01004669 if (event->overflow_handler)
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004670 event->overflow_handler(event, data, regs);
Peter Zijlstra453f19e2009-11-20 22:19:43 +01004671 else
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004672 perf_event_output(event, data, regs);
Peter Zijlstra453f19e2009-11-20 22:19:43 +01004673
Peter Zijlstraf506b3d2011-05-26 17:02:53 +02004674 if (event->fasync && event->pending_kill) {
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004675 event->pending_wakeup = 1;
4676 irq_work_queue(&event->pending);
Peter Zijlstraf506b3d2011-05-26 17:02:53 +02004677 }
4678
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004679 return ret;
4680}
4681
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004682int perf_event_overflow(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004683 struct perf_sample_data *data,
4684 struct pt_regs *regs)
4685{
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004686 return __perf_event_overflow(event, 1, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004687}
4688
4689/*
4690 * Generic software event infrastructure
4691 */
4692
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004693struct swevent_htable {
4694 struct swevent_hlist *swevent_hlist;
4695 struct mutex hlist_mutex;
4696 int hlist_refcount;
4697
4698 /* Recursion avoidance in each contexts */
4699 int recursion[PERF_NR_CONTEXTS];
4700};
4701
4702static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4703
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004704/*
4705 * We directly increment event->count and keep a second value in
4706 * event->hw.period_left to count intervals. This period event
4707 * event->hw.period_left to count intervals. This period count
4708 * is kept in the range [-sample_period, 0] so that we can use its
4709 * sign as the overflow trigger.
4710
4711static u64 perf_swevent_set_period(struct perf_event *event)
4712{
4713 struct hw_perf_event *hwc = &event->hw;
4714 u64 period = hwc->last_period;
4715 u64 nr, offset;
4716 s64 old, val;
4717
4718 hwc->last_period = hwc->sample_period;
4719
4720again:
Peter Zijlstrae7850592010-05-21 14:43:08 +02004721 old = val = local64_read(&hwc->period_left);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004722 if (val < 0)
4723 return 0;
4724
4725 nr = div64_u64(period + val, period);
4726 offset = nr * period;
4727 val -= offset;
Peter Zijlstrae7850592010-05-21 14:43:08 +02004728 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004729 goto again;
4730
4731 return nr;
4732}
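/*
 * Worked example, assuming sample_period == 100: period_left starts
 * out at -100 and perf_swevent_event() below adds each event count to
 * it.  If it is read here as 230, then 330 events have elapsed since
 * the last reset, so nr = (100 + 230) / 100 = 3 periods are reported
 * and period_left is rewound to 230 - 300 = -70, i.e. 70 more events
 * until the next overflow.
 */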
4733
Peter Zijlstra0cff7842009-11-20 22:19:44 +01004734static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004735 struct perf_sample_data *data,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004736 struct pt_regs *regs)
4737{
4738 struct hw_perf_event *hwc = &event->hw;
4739 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004740
4741 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01004742 if (!overflow)
4743 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004744
4745 if (hwc->interrupts == MAX_INTERRUPTS)
4746 return;
4747
4748 for (; overflow; overflow--) {
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004749 if (__perf_event_overflow(event, throttle,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004750 data, regs)) {
4751 /*
4752 * We inhibit the overflow from happening when
4753 * hwc->interrupts == MAX_INTERRUPTS.
4754 */
4755 break;
4756 }
4757 throttle = 1;
4758 }
4759}
4760
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004761static void perf_swevent_event(struct perf_event *event, u64 nr,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004762 struct perf_sample_data *data,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004763 struct pt_regs *regs)
4764{
4765 struct hw_perf_event *hwc = &event->hw;
4766
Peter Zijlstrae7850592010-05-21 14:43:08 +02004767 local64_add(nr, &event->count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004768
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004769 if (!regs)
4770 return;
4771
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01004772 if (!is_sampling_event(event))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01004773 return;
4774
4775 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004776 return perf_swevent_overflow(event, 1, data, regs);
Peter Zijlstra0cff7842009-11-20 22:19:44 +01004777
Peter Zijlstrae7850592010-05-21 14:43:08 +02004778 if (local64_add_negative(nr, &hwc->period_left))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01004779 return;
4780
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004781 perf_swevent_overflow(event, 0, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004782}
4783
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01004784static int perf_exclude_event(struct perf_event *event,
4785 struct pt_regs *regs)
4786{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004787 if (event->hw.state & PERF_HES_STOPPED)
Frederic Weisbecker91b2f482011-03-07 21:27:08 +01004788 return 1;
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004789
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01004790 if (regs) {
4791 if (event->attr.exclude_user && user_mode(regs))
4792 return 1;
4793
4794 if (event->attr.exclude_kernel && !user_mode(regs))
4795 return 1;
4796 }
4797
4798 return 0;
4799}
4800
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004801static int perf_swevent_match(struct perf_event *event,
4802 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08004803 u32 event_id,
4804 struct perf_sample_data *data,
4805 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004806{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004807 if (event->attr.type != type)
4808 return 0;
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01004809
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004810 if (event->attr.config != event_id)
4811 return 0;
4812
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01004813 if (perf_exclude_event(event, regs))
4814 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004815
4816 return 1;
4817}
4818
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004819static inline u64 swevent_hash(u64 type, u32 event_id)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004820{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004821 u64 val = event_id | (type << 32);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004822
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004823 return hash_64(val, SWEVENT_HLIST_BITS);
4824}
4825
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004826static inline struct hlist_head *
4827__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004828{
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004829 u64 hash = swevent_hash(type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004830
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004831 return &hlist->heads[hash];
4832}
4833
4834/* For the read side: events when they trigger */
4835static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004836find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004837{
4838 struct swevent_hlist *hlist;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004839
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004840 hlist = rcu_dereference(swhash->swevent_hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004841 if (!hlist)
4842 return NULL;
4843
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004844 return __find_swevent_head(hlist, type, event_id);
4845}
4846
4847/* For the event head insertion and removal in the hlist */
4848static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004849find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004850{
4851 struct swevent_hlist *hlist;
4852 u32 event_id = event->attr.config;
4853 u64 type = event->attr.type;
4854
4855 /*
4856 * Event scheduling is always serialized against hlist allocation
4857	 * and release, which makes the protected version suitable here;
4858	 * the context lock guarantees that serialization.
4859 */
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004860 hlist = rcu_dereference_protected(swhash->swevent_hlist,
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004861 lockdep_is_held(&event->ctx->lock));
4862 if (!hlist)
4863 return NULL;
4864
4865 return __find_swevent_head(hlist, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004866}
4867
4868static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004869 u64 nr,
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004870 struct perf_sample_data *data,
4871 struct pt_regs *regs)
4872{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004873 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004874 struct perf_event *event;
4875 struct hlist_node *node;
4876 struct hlist_head *head;
4877
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004878 rcu_read_lock();
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004879 head = find_swevent_head_rcu(swhash, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004880 if (!head)
4881 goto end;
4882
4883 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08004884 if (perf_swevent_match(event, type, event_id, data, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004885 perf_swevent_event(event, nr, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004886 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004887end:
4888 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004889}
4890
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01004891int perf_swevent_get_recursion_context(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004892{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004893 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01004894
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004895 return get_recursion_context(swhash->recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004896}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01004897EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004898
Jesper Juhlfa9f90b2010-11-28 21:39:34 +01004899inline void perf_swevent_put_recursion_context(int rctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004900{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004901 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004902
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004903 put_recursion_context(swhash->recursion, rctx);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01004904}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004905
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004906void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004907{
Ingo Molnara4234bf2009-11-23 10:57:59 +01004908 struct perf_sample_data data;
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01004909 int rctx;
4910
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02004911 preempt_disable_notrace();
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01004912 rctx = perf_swevent_get_recursion_context();
4913 if (rctx < 0)
4914 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004915
Peter Zijlstradc1d6282010-03-03 15:55:04 +01004916 perf_sample_data_init(&data, addr);
Ingo Molnara4234bf2009-11-23 10:57:59 +01004917
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02004918 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01004919
4920 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02004921 preempt_enable_notrace();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004922}
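/*
 * Callers normally go through the perf_sw_event() wrapper in
 * <linux/perf_event.h>, which only drops into __perf_sw_event() when
 * the jump label in perf_swevent_enabled[] (declared below) says the
 * event id is in use.  A typical call site, sketched after the page
 * fault handlers:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 */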
4923
4924static void perf_swevent_read(struct perf_event *event)
4925{
4926}
4927
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004928static int perf_swevent_add(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004929{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004930 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004931 struct hw_perf_event *hwc = &event->hw;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004932 struct hlist_head *head;
4933
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01004934 if (is_sampling_event(event)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004935 hwc->last_period = hwc->sample_period;
4936 perf_swevent_set_period(event);
4937 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004938
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004939 hwc->state = !(flags & PERF_EF_START);
4940
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004941 head = find_swevent_head(swhash, event);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004942 if (WARN_ON_ONCE(!head))
4943 return -EINVAL;
4944
4945 hlist_add_head_rcu(&event->hlist_entry, head);
4946
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004947 return 0;
4948}
4949
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004950static void perf_swevent_del(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004951{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004952 hlist_del_rcu(&event->hlist_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004953}
4954
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004955static void perf_swevent_start(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02004956{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004957 event->hw.state = 0;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02004958}
4959
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004960static void perf_swevent_stop(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02004961{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004962 event->hw.state = PERF_HES_STOPPED;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02004963}
4964
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004965/* Deref the hlist from the update side */
4966static inline struct swevent_hlist *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004967swevent_hlist_deref(struct swevent_htable *swhash)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004968{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004969 return rcu_dereference_protected(swhash->swevent_hlist,
4970 lockdep_is_held(&swhash->hlist_mutex));
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004971}
4972
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004973static void swevent_hlist_release(struct swevent_htable *swhash)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004974{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004975 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004976
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02004977 if (!hlist)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004978 return;
4979
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004980 rcu_assign_pointer(swhash->swevent_hlist, NULL);
Lai Jiangshanfa4bbc42011-03-18 12:08:29 +08004981 kfree_rcu(hlist, rcu_head);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004982}
4983
4984static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4985{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004986 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004987
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004988 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004989
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004990 if (!--swhash->hlist_refcount)
4991 swevent_hlist_release(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004992
Peter Zijlstrab28ab832010-09-06 14:48:15 +02004993 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02004994}
4995
4996static void swevent_hlist_put(struct perf_event *event)
4997{
4998 int cpu;
4999
5000 if (event->cpu != -1) {
5001 swevent_hlist_put_cpu(event, event->cpu);
5002 return;
5003 }
5004
5005 for_each_possible_cpu(cpu)
5006 swevent_hlist_put_cpu(event, cpu);
5007}
5008
5009static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5010{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005011 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005012 int err = 0;
5013
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005014 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005015
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005016 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005017 struct swevent_hlist *hlist;
5018
5019 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5020 if (!hlist) {
5021 err = -ENOMEM;
5022 goto exit;
5023 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005024 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005025 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005026 swhash->hlist_refcount++;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005027exit:
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005028 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005029
5030 return err;
5031}
5032
5033static int swevent_hlist_get(struct perf_event *event)
5034{
5035 int err;
5036 int cpu, failed_cpu;
5037
5038 if (event->cpu != -1)
5039 return swevent_hlist_get_cpu(event, event->cpu);
5040
5041 get_online_cpus();
5042 for_each_possible_cpu(cpu) {
5043 err = swevent_hlist_get_cpu(event, cpu);
5044 if (err) {
5045 failed_cpu = cpu;
5046 goto fail;
5047 }
5048 }
5049 put_online_cpus();
5050
5051 return 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005052fail:
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005053 for_each_possible_cpu(cpu) {
5054 if (cpu == failed_cpu)
5055 break;
5056 swevent_hlist_put_cpu(event, cpu);
5057 }
5058
5059 put_online_cpus();
5060 return err;
5061}
5062
Jason Barond430d3d2011-03-16 17:29:47 -04005063struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005064
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005065static void sw_perf_event_destroy(struct perf_event *event)
5066{
5067 u64 event_id = event->attr.config;
5068
5069 WARN_ON(event->parent);
5070
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005071 jump_label_dec(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005072 swevent_hlist_put(event);
5073}
5074
5075static int perf_swevent_init(struct perf_event *event)
5076{
5077 int event_id = event->attr.config;
5078
5079 if (event->attr.type != PERF_TYPE_SOFTWARE)
5080 return -ENOENT;
5081
5082 switch (event_id) {
5083 case PERF_COUNT_SW_CPU_CLOCK:
5084 case PERF_COUNT_SW_TASK_CLOCK:
5085 return -ENOENT;
5086
5087 default:
5088 break;
5089 }
5090
Dan Carpenterce677832010-10-24 21:50:42 +02005091 if (event_id >= PERF_COUNT_SW_MAX)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005092 return -ENOENT;
5093
5094 if (!event->parent) {
5095 int err;
5096
5097 err = swevent_hlist_get(event);
5098 if (err)
5099 return err;
5100
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005101 jump_label_inc(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005102 event->destroy = sw_perf_event_destroy;
5103 }
5104
5105 return 0;
5106}
5107
5108static struct pmu perf_swevent = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005109 .task_ctx_nr = perf_sw_context,
5110
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005111 .event_init = perf_swevent_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005112 .add = perf_swevent_add,
5113 .del = perf_swevent_del,
5114 .start = perf_swevent_start,
5115 .stop = perf_swevent_stop,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005116 .read = perf_swevent_read,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005117};
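/*
 * Note that perf_swevent_init() above deliberately bounces the two
 * clock event ids to the dedicated pmus further down.  A minimal
 * in-kernel user of this pmu, as a sketch rather than code from this
 * file, could look like:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_CONTEXT_SWITCHES,
 *		.size	= sizeof(attr),
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *					      NULL, NULL);
 *	if (!IS_ERR(ev))
 *		total = perf_event_read_value(ev, &enabled, &running);
 */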
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005118
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005119#ifdef CONFIG_EVENT_TRACING
5120
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005121static int perf_tp_filter_match(struct perf_event *event,
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005122 struct perf_sample_data *data)
5123{
5124 void *record = data->raw->data;
5125
5126 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5127 return 1;
5128 return 0;
5129}
5130
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005131static int perf_tp_event_match(struct perf_event *event,
5132 struct perf_sample_data *data,
5133 struct pt_regs *regs)
5134{
Frederic Weisbeckera0f7d0f2011-03-07 21:27:09 +01005135 if (event->hw.state & PERF_HES_STOPPED)
5136 return 0;
Peter Zijlstra580d6072010-05-20 20:54:31 +02005137 /*
5138 * All tracepoints are from kernel-space.
5139 */
5140 if (event->attr.exclude_kernel)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005141 return 0;
5142
5143 if (!perf_tp_filter_match(event, data))
5144 return 0;
5145
5146 return 1;
5147}
5148
5149void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005150 struct pt_regs *regs, struct hlist_head *head, int rctx)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005151{
5152 struct perf_sample_data data;
5153 struct perf_event *event;
5154 struct hlist_node *node;
5155
5156 struct perf_raw_record raw = {
5157 .size = entry_size,
5158 .data = record,
5159 };
5160
5161 perf_sample_data_init(&data, addr);
5162 data.raw = &raw;
5163
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005164 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5165 if (perf_tp_event_match(event, &data, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02005166 perf_swevent_event(event, count, &data, regs);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005167 }
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005168
5169 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005170}
5171EXPORT_SYMBOL_GPL(perf_tp_event);
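/*
 * perf_tp_event() is the entry point used by the generated
 * perf_trace_##call probes: they assemble the raw record in a per-cpu
 * buffer, look up the hlist of events attached to that tracepoint and
 * pass both in, together with the recursion context they already hold.
 * Filtering runs here, against the raw record, via
 * perf_tp_filter_match() above.
 */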
5172
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005173static void tp_perf_event_destroy(struct perf_event *event)
5174{
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005175 perf_trace_destroy(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005176}
5177
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005178static int perf_tp_event_init(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005179{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005180 int err;
5181
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005182 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5183 return -ENOENT;
5184
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005185 err = perf_trace_init(event);
5186 if (err)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005187 return err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005188
5189 event->destroy = tp_perf_event_destroy;
5190
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005191 return 0;
5192}
5193
5194static struct pmu perf_tracepoint = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005195 .task_ctx_nr = perf_sw_context,
5196
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005197 .event_init = perf_tp_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005198 .add = perf_trace_add,
5199 .del = perf_trace_del,
5200 .start = perf_swevent_start,
5201 .stop = perf_swevent_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005202 .read = perf_swevent_read,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005203};
5204
5205static inline void perf_tp_register(void)
5206{
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005207 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005208}
Li Zefan6fb29152009-10-15 11:21:42 +08005209
5210static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5211{
5212 char *filter_str;
5213 int ret;
5214
5215 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5216 return -EINVAL;
5217
5218 filter_str = strndup_user(arg, PAGE_SIZE);
5219 if (IS_ERR(filter_str))
5220 return PTR_ERR(filter_str);
5221
5222 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5223
5224 kfree(filter_str);
5225 return ret;
5226}
5227
5228static void perf_event_free_filter(struct perf_event *event)
5229{
5230 ftrace_profile_free_filter(event);
5231}
5232
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005233#else
Li Zefan6fb29152009-10-15 11:21:42 +08005234
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005235static inline void perf_tp_register(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005236{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005237}
Li Zefan6fb29152009-10-15 11:21:42 +08005238
5239static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5240{
5241 return -ENOENT;
5242}
5243
5244static void perf_event_free_filter(struct perf_event *event)
5245{
5246}
5247
Li Zefan07b139c2009-12-21 14:27:35 +08005248#endif /* CONFIG_EVENT_TRACING */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005249
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005250#ifdef CONFIG_HAVE_HW_BREAKPOINT
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005251void perf_bp_event(struct perf_event *bp, void *data)
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005252{
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005253 struct perf_sample_data sample;
5254 struct pt_regs *regs = data;
5255
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005256 perf_sample_data_init(&sample, bp->attr.bp_addr);
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005257
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005258 if (!bp->hw.state && !perf_exclude_event(bp, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02005259 perf_swevent_event(bp, 1, &sample, regs);
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005260}
5261#endif
5262
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005263/*
5264 * hrtimer based swevent callback
5265 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005266
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005267static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005268{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005269 enum hrtimer_restart ret = HRTIMER_RESTART;
5270 struct perf_sample_data data;
5271 struct pt_regs *regs;
5272 struct perf_event *event;
5273 u64 period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005274
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005275 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005276
5277 if (event->state != PERF_EVENT_STATE_ACTIVE)
5278 return HRTIMER_NORESTART;
5279
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005280 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005281
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005282 perf_sample_data_init(&data, 0);
5283 data.period = event->hw.last_period;
5284 regs = get_irq_regs();
5285
5286 if (regs && !perf_exclude_event(event, regs)) {
5287 if (!(event->attr.exclude_idle && current->pid == 0))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02005288 if (perf_event_overflow(event, &data, regs))
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005289 ret = HRTIMER_NORESTART;
5290 }
5291
5292 period = max_t(u64, 10000, event->hw.sample_period);
5293 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5294
5295 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005296}
5297
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005298static void perf_swevent_start_hrtimer(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005299{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005300 struct hw_perf_event *hwc = &event->hw;
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005301 s64 period;
5302
5303 if (!is_sampling_event(event))
5304 return;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005305
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005306 period = local64_read(&hwc->period_left);
5307 if (period) {
5308 if (period < 0)
5309 period = 10000;
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005310
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005311 local64_set(&hwc->period_left, 0);
5312 } else {
5313 period = max_t(u64, 10000, hwc->sample_period);
5314 }
5315 __hrtimer_start_range_ns(&hwc->hrtimer,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005316 ns_to_ktime(period), 0,
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02005317 HRTIMER_MODE_REL_PINNED, 0);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005318}
5319
5320static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5321{
5322 struct hw_perf_event *hwc = &event->hw;
5323
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005324 if (is_sampling_event(event)) {
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005325 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005326 local64_set(&hwc->period_left, ktime_to_ns(remaining));
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005327
5328 hrtimer_cancel(&hwc->hrtimer);
5329 }
5330}
5331
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005332static void perf_swevent_init_hrtimer(struct perf_event *event)
5333{
5334 struct hw_perf_event *hwc = &event->hw;
5335
5336 if (!is_sampling_event(event))
5337 return;
5338
5339 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5340 hwc->hrtimer.function = perf_swevent_hrtimer;
5341
5342 /*
5343 * Since hrtimers have a fixed rate, we can do a static freq->period
5344 * mapping and avoid the whole period adjust feedback stuff.
5345 */
5346 if (event->attr.freq) {
5347 long freq = event->attr.sample_freq;
5348
5349 event->attr.sample_period = NSEC_PER_SEC / freq;
5350 hwc->sample_period = event->attr.sample_period;
5351 local64_set(&hwc->period_left, hwc->sample_period);
5352 event->attr.freq = 0;
5353 }
5354}
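/*
 * For example, attr.freq = 1 with attr.sample_freq = 4000 becomes a
 * fixed sample_period of NSEC_PER_SEC / 4000 = 250000ns here, after
 * which the event behaves exactly like a plain period based one.
 */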
5355
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005356/*
5357 * Software event: cpu wall time clock
5358 */
5359
5360static void cpu_clock_event_update(struct perf_event *event)
5361{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005362 s64 prev;
5363 u64 now;
5364
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005365 now = local_clock();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005366 prev = local64_xchg(&event->hw.prev_count, now);
5367 local64_add(now - prev, &event->count);
5368}
5369
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005370static void cpu_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005371{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005372 local64_set(&event->hw.prev_count, local_clock());
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005373 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005374}
5375
5376static void cpu_clock_event_stop(struct perf_event *event, int flags)
5377{
5378 perf_swevent_cancel_hrtimer(event);
5379 cpu_clock_event_update(event);
5380}
5381
5382static int cpu_clock_event_add(struct perf_event *event, int flags)
5383{
5384 if (flags & PERF_EF_START)
5385 cpu_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005386
5387 return 0;
5388}
5389
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005390static void cpu_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005391{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005392 cpu_clock_event_stop(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005393}
5394
5395static void cpu_clock_event_read(struct perf_event *event)
5396{
5397 cpu_clock_event_update(event);
5398}
5399
5400static int cpu_clock_event_init(struct perf_event *event)
5401{
5402 if (event->attr.type != PERF_TYPE_SOFTWARE)
5403 return -ENOENT;
5404
5405 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5406 return -ENOENT;
5407
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005408 perf_swevent_init_hrtimer(event);
5409
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005410 return 0;
5411}
5412
5413static struct pmu perf_cpu_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005414 .task_ctx_nr = perf_sw_context,
5415
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005416 .event_init = cpu_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005417 .add = cpu_clock_event_add,
5418 .del = cpu_clock_event_del,
5419 .start = cpu_clock_event_start,
5420 .stop = cpu_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005421 .read = cpu_clock_event_read,
5422};
5423
5424/*
5425 * Software event: task time clock
5426 */
5427
5428static void task_clock_event_update(struct perf_event *event, u64 now)
5429{
5430 u64 prev;
5431 s64 delta;
5432
5433 prev = local64_xchg(&event->hw.prev_count, now);
5434 delta = now - prev;
5435 local64_add(delta, &event->count);
5436}
5437
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005438static void task_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005439{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005440 local64_set(&event->hw.prev_count, event->ctx->time);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005441 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005442}
5443
5444static void task_clock_event_stop(struct perf_event *event, int flags)
5445{
5446 perf_swevent_cancel_hrtimer(event);
5447 task_clock_event_update(event, event->ctx->time);
5448}
5449
5450static int task_clock_event_add(struct perf_event *event, int flags)
5451{
5452 if (flags & PERF_EF_START)
5453 task_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005454
5455 return 0;
5456}
5457
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005458static void task_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005459{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005460 task_clock_event_stop(event, PERF_EF_UPDATE);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005461}
5462
5463static void task_clock_event_read(struct perf_event *event)
5464{
Peter Zijlstra768a06e2011-02-22 16:52:24 +01005465 u64 now = perf_clock();
5466 u64 delta = now - event->ctx->timestamp;
5467 u64 time = event->ctx->time + delta;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005468
5469 task_clock_event_update(event, time);
5470}
5471
5472static int task_clock_event_init(struct perf_event *event)
5473{
5474 if (event->attr.type != PERF_TYPE_SOFTWARE)
5475 return -ENOENT;
5476
5477 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5478 return -ENOENT;
5479
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005480 perf_swevent_init_hrtimer(event);
5481
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005482 return 0;
5483}
5484
5485static struct pmu perf_task_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005486 .task_ctx_nr = perf_sw_context,
5487
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005488 .event_init = task_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005489 .add = task_clock_event_add,
5490 .del = task_clock_event_del,
5491 .start = task_clock_event_start,
5492 .stop = task_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005493 .read = task_clock_event_read,
5494};
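/*
 * The two clock pmus differ only in their time base: cpu-clock reads
 * local_clock() directly and so advances whenever the CPU runs, while
 * task-clock snapshots ctx->time, which only advances while the
 * owning task's context is scheduled in, i.e. it measures the time
 * the task actually spent on a CPU.
 */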
5495
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005496static void perf_pmu_nop_void(struct pmu *pmu)
5497{
5498}
5499
5500static int perf_pmu_nop_int(struct pmu *pmu)
5501{
5502 return 0;
5503}
5504
5505static void perf_pmu_start_txn(struct pmu *pmu)
5506{
5507 perf_pmu_disable(pmu);
5508}
5509
5510static int perf_pmu_commit_txn(struct pmu *pmu)
5511{
5512 perf_pmu_enable(pmu);
5513 return 0;
5514}
5515
5516static void perf_pmu_cancel_txn(struct pmu *pmu)
5517{
5518 perf_pmu_enable(pmu);
5519}
5520
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005521/*
5522 * Ensures all contexts with the same task_ctx_nr have the same
5523 * pmu_cpu_context too.
5524 */
5525static void *find_pmu_context(int ctxn)
5526{
5527 struct pmu *pmu;
5528
5529 if (ctxn < 0)
5530 return NULL;
5531
5532 list_for_each_entry(pmu, &pmus, entry) {
5533 if (pmu->task_ctx_nr == ctxn)
5534 return pmu->pmu_cpu_context;
5535 }
5536
5537 return NULL;
5538}
5539
Peter Zijlstra51676952010-12-07 14:18:20 +01005540static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005541{
Peter Zijlstra51676952010-12-07 14:18:20 +01005542 int cpu;
5543
5544 for_each_possible_cpu(cpu) {
5545 struct perf_cpu_context *cpuctx;
5546
5547 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5548
5549 if (cpuctx->active_pmu == old_pmu)
5550 cpuctx->active_pmu = pmu;
5551 }
5552}
5553
5554static void free_pmu_context(struct pmu *pmu)
5555{
5556 struct pmu *i;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005557
5558 mutex_lock(&pmus_lock);
5559 /*
5560 * Like a real lame refcount.
5561 */
Peter Zijlstra51676952010-12-07 14:18:20 +01005562 list_for_each_entry(i, &pmus, entry) {
5563 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5564 update_pmu_context(i, pmu);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005565 goto out;
Peter Zijlstra51676952010-12-07 14:18:20 +01005566 }
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005567 }
5568
Peter Zijlstra51676952010-12-07 14:18:20 +01005569 free_percpu(pmu->pmu_cpu_context);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005570out:
5571 mutex_unlock(&pmus_lock);
5572}
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005573static struct idr pmu_idr;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005574
Peter Zijlstraabe43402010-11-17 23:17:37 +01005575static ssize_t
5576type_show(struct device *dev, struct device_attribute *attr, char *page)
5577{
5578 struct pmu *pmu = dev_get_drvdata(dev);
5579
5580 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5581}
5582
5583static struct device_attribute pmu_dev_attrs[] = {
5584 __ATTR_RO(type),
5585 __ATTR_NULL,
5586};
5587
5588static int pmu_bus_running;
5589static struct bus_type pmu_bus = {
5590 .name = "event_source",
5591 .dev_attrs = pmu_dev_attrs,
5592};
5593
5594static void pmu_dev_release(struct device *dev)
5595{
5596 kfree(dev);
5597}
5598
5599static int pmu_dev_alloc(struct pmu *pmu)
5600{
5601 int ret = -ENOMEM;
5602
5603 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5604 if (!pmu->dev)
5605 goto out;
5606
5607 device_initialize(pmu->dev);
5608 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5609 if (ret)
5610 goto free_dev;
5611
5612 dev_set_drvdata(pmu->dev, pmu);
5613 pmu->dev->bus = &pmu_bus;
5614 pmu->dev->release = pmu_dev_release;
5615 ret = device_add(pmu->dev);
5616 if (ret)
5617 goto free_dev;
5618
5619out:
5620 return ret;
5621
5622free_dev:
5623 put_device(pmu->dev);
5624 goto out;
5625}
5626
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005627static struct lock_class_key cpuctx_mutex;
Peter Zijlstrafacc4302011-04-09 21:17:42 +02005628static struct lock_class_key cpuctx_lock;
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005629
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005630int perf_pmu_register(struct pmu *pmu, char *name, int type)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005631{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005632 int cpu, ret;
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005633
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005634 mutex_lock(&pmus_lock);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005635 ret = -ENOMEM;
5636 pmu->pmu_disable_count = alloc_percpu(int);
5637 if (!pmu->pmu_disable_count)
5638 goto unlock;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005639
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005640 pmu->type = -1;
5641 if (!name)
5642 goto skip_type;
5643 pmu->name = name;
5644
5645 if (type < 0) {
5646 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5647 if (!err)
5648 goto free_pdc;
5649
5650 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5651 if (err) {
5652 ret = err;
5653 goto free_pdc;
5654 }
5655 }
5656 pmu->type = type;
5657
Peter Zijlstraabe43402010-11-17 23:17:37 +01005658 if (pmu_bus_running) {
5659 ret = pmu_dev_alloc(pmu);
5660 if (ret)
5661 goto free_idr;
5662 }
5663
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005664skip_type:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005665 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5666 if (pmu->pmu_cpu_context)
5667 goto got_cpu_context;
5668
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005669 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5670 if (!pmu->pmu_cpu_context)
Peter Zijlstraabe43402010-11-17 23:17:37 +01005671 goto free_dev;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005672
5673 for_each_possible_cpu(cpu) {
5674 struct perf_cpu_context *cpuctx;
5675
5676 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Peter Zijlstraeb184472010-09-07 15:55:13 +02005677 __perf_event_init_context(&cpuctx->ctx);
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005678 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
Peter Zijlstrafacc4302011-04-09 21:17:42 +02005679 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02005680 cpuctx->ctx.type = cpu_context;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005681 cpuctx->ctx.pmu = pmu;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02005682 cpuctx->jiffies_interval = 1;
5683 INIT_LIST_HEAD(&cpuctx->rotation_list);
Peter Zijlstra51676952010-12-07 14:18:20 +01005684 cpuctx->active_pmu = pmu;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005685 }
5686
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005687got_cpu_context:
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005688 if (!pmu->start_txn) {
5689 if (pmu->pmu_enable) {
5690 /*
5691 * If we have pmu_enable/pmu_disable calls, install
5692 * transaction stubs that use that to try and batch
5693 * hardware accesses.
5694 */
5695 pmu->start_txn = perf_pmu_start_txn;
5696 pmu->commit_txn = perf_pmu_commit_txn;
5697 pmu->cancel_txn = perf_pmu_cancel_txn;
5698 } else {
5699 pmu->start_txn = perf_pmu_nop_void;
5700 pmu->commit_txn = perf_pmu_nop_int;
5701 pmu->cancel_txn = perf_pmu_nop_void;
5702 }
5703 }
5704
5705 if (!pmu->pmu_enable) {
5706 pmu->pmu_enable = perf_pmu_nop_void;
5707 pmu->pmu_disable = perf_pmu_nop_void;
5708 }
5709
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005710 list_add_rcu(&pmu->entry, &pmus);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005711 ret = 0;
5712unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005713 mutex_unlock(&pmus_lock);
5714
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005715 return ret;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005716
Peter Zijlstraabe43402010-11-17 23:17:37 +01005717free_dev:
5718 device_del(pmu->dev);
5719 put_device(pmu->dev);
5720
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005721free_idr:
5722 if (pmu->type >= PERF_TYPE_MAX)
5723 idr_remove(&pmu_idr, pmu->type);
5724
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005725free_pdc:
5726 free_percpu(pmu->pmu_disable_count);
5727 goto unlock;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005728}
5729
5730void perf_pmu_unregister(struct pmu *pmu)
5731{
5732 mutex_lock(&pmus_lock);
5733 list_del_rcu(&pmu->entry);
5734 mutex_unlock(&pmus_lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005735
5736 /*
Peter Zijlstracde8e882010-09-13 11:06:55 +02005737 * We dereference the pmu list under both SRCU and regular RCU, so
5738 * synchronize against both of those.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005739 */
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005740 synchronize_srcu(&pmus_srcu);
Peter Zijlstracde8e882010-09-13 11:06:55 +02005741 synchronize_rcu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005742
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005743 free_percpu(pmu->pmu_disable_count);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005744 if (pmu->type >= PERF_TYPE_MAX)
5745 idr_remove(&pmu_idr, pmu->type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01005746 device_del(pmu->dev);
5747 put_device(pmu->dev);
Peter Zijlstra51676952010-12-07 14:18:20 +01005748 free_pmu_context(pmu);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005749}
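/*
 * Editor's illustrative sketch (not part of this file): roughly how a driver
 * might pair perf_pmu_register()/perf_pmu_unregister() above. The "my_*"
 * names and the empty callback bodies are made up for the example, and the
 * usual <linux/module.h>/<linux/perf_event.h> driver context is assumed; a
 * real PMU fills the callbacks in with hardware programming.
 */
#if 0	/* example only, not compiled */
static struct pmu my_pmu;

static int my_event_init(struct perf_event *event)
{
	/* -ENOENT tells perf_init_event() to keep trying other PMUs */
	if (event->attr.type != my_pmu.type)
		return -ENOENT;
	return 0;
}

static int  my_add(struct perf_event *event, int flags)	{ return 0; }
static void my_del(struct perf_event *event, int flags)	{ }
static void my_start(struct perf_event *event, int flags)	{ }
static void my_stop(struct perf_event *event, int flags)	{ }
static void my_read(struct perf_event *event)			{ }

static struct pmu my_pmu = {
	.task_ctx_nr	= perf_sw_context,	/* which task context to use */
	.event_init	= my_event_init,
	.add		= my_add,
	.del		= my_del,
	.start		= my_start,
	.stop		= my_stop,
	.read		= my_read,
};

static int __init my_pmu_init(void)
{
	/* type == -1 requests a dynamically allocated id from pmu_idr */
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}

static void __exit my_pmu_exit(void)
{
	perf_pmu_unregister(&my_pmu);
}

module_init(my_pmu_init);
module_exit(my_pmu_exit);
#endif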
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005750
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005751struct pmu *perf_init_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005752{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02005753 struct pmu *pmu = NULL;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005754 int idx;
Lin Ming940c5b22011-02-27 21:13:31 +08005755 int ret;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005756
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005757 idx = srcu_read_lock(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005758
5759 rcu_read_lock();
5760 pmu = idr_find(&pmu_idr, event->attr.type);
5761 rcu_read_unlock();
Lin Ming940c5b22011-02-27 21:13:31 +08005762 if (pmu) {
Mark Rutland7e5b2a02011-08-11 12:31:20 +01005763 event->pmu = pmu;
Lin Ming940c5b22011-02-27 21:13:31 +08005764 ret = pmu->event_init(event);
5765 if (ret)
5766 pmu = ERR_PTR(ret);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005767 goto unlock;
Lin Ming940c5b22011-02-27 21:13:31 +08005768 }
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005769
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005770 list_for_each_entry_rcu(pmu, &pmus, entry) {
Mark Rutland7e5b2a02011-08-11 12:31:20 +01005771 event->pmu = pmu;
Lin Ming940c5b22011-02-27 21:13:31 +08005772 ret = pmu->event_init(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005773 if (!ret)
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02005774 goto unlock;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005775
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005776 if (ret != -ENOENT) {
5777 pmu = ERR_PTR(ret);
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02005778 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005779 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005780 }
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02005781 pmu = ERR_PTR(-ENOENT);
5782unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005783 srcu_read_unlock(&pmus_srcu, idx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005784
5785 return pmu;
5786}
5787
5788/*
5789 * Allocate and initialize an event structure
5790 */
5791static struct perf_event *
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02005792perf_event_alloc(struct perf_event_attr *attr, int cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02005793 struct task_struct *task,
5794 struct perf_event *group_leader,
5795 struct perf_event *parent_event,
Avi Kivity4dc0da82011-06-29 18:42:35 +03005796 perf_overflow_handler_t overflow_handler,
5797 void *context)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005798{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02005799 struct pmu *pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005800 struct perf_event *event;
5801 struct hw_perf_event *hwc;
5802 long err;
5803
Oleg Nesterov66832eb2011-01-18 17:10:32 +01005804 if ((unsigned)cpu >= nr_cpu_ids) {
5805 if (!task || cpu != -1)
5806 return ERR_PTR(-EINVAL);
5807 }
5808
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02005809 event = kzalloc(sizeof(*event), GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005810 if (!event)
5811 return ERR_PTR(-ENOMEM);
5812
5813 /*
5814 * Single events are their own group leaders, with an
5815 * empty sibling list:
5816 */
5817 if (!group_leader)
5818 group_leader = event;
5819
5820 mutex_init(&event->child_mutex);
5821 INIT_LIST_HEAD(&event->child_list);
5822
5823 INIT_LIST_HEAD(&event->group_entry);
5824 INIT_LIST_HEAD(&event->event_entry);
5825 INIT_LIST_HEAD(&event->sibling_list);
5826 init_waitqueue_head(&event->waitq);
Peter Zijlstrae360adb2010-10-14 14:01:34 +08005827 init_irq_work(&event->pending, perf_pending_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005828
5829 mutex_init(&event->mmap_mutex);
5830
5831 event->cpu = cpu;
5832 event->attr = *attr;
5833 event->group_leader = group_leader;
5834 event->pmu = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005835 event->oncpu = -1;
5836
5837 event->parent = parent_event;
5838
5839 event->ns = get_pid_ns(current->nsproxy->pid_ns);
5840 event->id = atomic64_inc_return(&perf_event_id);
5841
5842 event->state = PERF_EVENT_STATE_INACTIVE;
5843
Peter Zijlstrad580ff82010-10-14 17:43:23 +02005844 if (task) {
5845 event->attach_state = PERF_ATTACH_TASK;
5846#ifdef CONFIG_HAVE_HW_BREAKPOINT
5847 /*
5848 * hw_breakpoint is a bit difficult here..
5849 */
5850 if (attr->type == PERF_TYPE_BREAKPOINT)
5851 event->hw.bp_target = task;
5852#endif
5853 }
5854
Avi Kivity4dc0da82011-06-29 18:42:35 +03005855 if (!overflow_handler && parent_event) {
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01005856 overflow_handler = parent_event->overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03005857 context = parent_event->overflow_handler_context;
5858 }
Oleg Nesterov66832eb2011-01-18 17:10:32 +01005859
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01005860 event->overflow_handler = overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03005861 event->overflow_handler_context = context;
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02005862
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005863 if (attr->disabled)
5864 event->state = PERF_EVENT_STATE_OFF;
5865
5866 pmu = NULL;
5867
5868 hwc = &event->hw;
5869 hwc->sample_period = attr->sample_period;
5870 if (attr->freq && attr->sample_freq)
5871 hwc->sample_period = 1;
5872 hwc->last_period = hwc->sample_period;
5873
Peter Zijlstrae7850592010-05-21 14:43:08 +02005874 local64_set(&hwc->period_left, hwc->sample_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005875
5876 /*
5877 * we currently do not support PERF_FORMAT_GROUP on inherited events
5878 */
5879 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
5880 goto done;
5881
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005882 pmu = perf_init_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005883
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005884done:
5885 err = 0;
5886 if (!pmu)
5887 err = -EINVAL;
5888 else if (IS_ERR(pmu))
5889 err = PTR_ERR(pmu);
5890
5891 if (err) {
5892 if (event->ns)
5893 put_pid_ns(event->ns);
5894 kfree(event);
5895 return ERR_PTR(err);
5896 }
5897
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005898 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02005899 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02005900 jump_label_inc(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01005901 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005902 atomic_inc(&nr_mmap_events);
5903 if (event->attr.comm)
5904 atomic_inc(&nr_comm_events);
5905 if (event->attr.task)
5906 atomic_inc(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005907 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5908 err = get_callchain_buffers();
5909 if (err) {
5910 free_event(event);
5911 return ERR_PTR(err);
5912 }
5913 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005914 }
5915
5916 return event;
5917}
5918
5919static int perf_copy_attr(struct perf_event_attr __user *uattr,
5920 struct perf_event_attr *attr)
5921{
5922 u32 size;
5923 int ret;
5924
5925 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
5926 return -EFAULT;
5927
5928 /*
5929 * zero the full structure, so that a short copy leaves the rest zeroed.
5930 */
5931 memset(attr, 0, sizeof(*attr));
5932
5933 ret = get_user(size, &uattr->size);
5934 if (ret)
5935 return ret;
5936
5937 if (size > PAGE_SIZE) /* silly large */
5938 goto err_size;
5939
5940 if (!size) /* abi compat */
5941 size = PERF_ATTR_SIZE_VER0;
5942
5943 if (size < PERF_ATTR_SIZE_VER0)
5944 goto err_size;
5945
5946 /*
5947 * If we're handed a bigger struct than we know of,
5948 * ensure all the unknown bits are 0 - i.e. new
5949 * user-space does not rely on any kernel feature
5950 * extensions we dont know about yet.
5951 */
5952 if (size > sizeof(*attr)) {
5953 unsigned char __user *addr;
5954 unsigned char __user *end;
5955 unsigned char val;
5956
5957 addr = (void __user *)uattr + sizeof(*attr);
5958 end = (void __user *)uattr + size;
5959
5960 for (; addr < end; addr++) {
5961 ret = get_user(val, addr);
5962 if (ret)
5963 return ret;
5964 if (val)
5965 goto err_size;
5966 }
5967 size = sizeof(*attr);
5968 }
5969
5970 ret = copy_from_user(attr, uattr, size);
5971 if (ret)
5972 return -EFAULT;
5973
Mahesh Salgaonkarcd757642010-01-30 10:25:18 +05305974 if (attr->__reserved_1)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005975 return -EINVAL;
5976
5977 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
5978 return -EINVAL;
5979
5980 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5981 return -EINVAL;
5982
5983out:
5984 return ret;
5985
5986err_size:
5987 put_user(sizeof(*attr), &uattr->size);
5988 ret = -E2BIG;
5989 goto out;
5990}
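/*
 * Editor's illustrative sketch (not part of this file): what the attr-size
 * handshake above looks like from user space. On -E2BIG the kernel has
 * written the attr size it supports back into uattr->size, so the caller can
 * report (or shrink to) that size. The try_open() helper is made up for the
 * example.
 */
#if 0	/* user-space example, not kernel code */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static int try_open(struct perf_event_attr *attr, pid_t pid, int cpu)
{
	int fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0UL);

	if (fd < 0 && errno == E2BIG)
		fprintf(stderr, "kernel accepts attr.size up to %u bytes\n",
			attr->size);
	return fd;
}
#endif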
5991
Peter Zijlstraac9721f2010-05-27 12:54:41 +02005992static int
5993perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005994{
Frederic Weisbecker76369132011-05-19 19:55:04 +02005995 struct ring_buffer *rb = NULL, *old_rb = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005996 int ret = -EINVAL;
5997
Peter Zijlstraac9721f2010-05-27 12:54:41 +02005998 if (!output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005999 goto set;
6000
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006001 /* don't allow circular references */
6002 if (event == output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006003 goto out;
6004
Peter Zijlstra0f139302010-05-20 14:35:15 +02006005 /*
6006 * Don't allow cross-cpu buffers
6007 */
6008 if (output_event->cpu != event->cpu)
6009 goto out;
6010
6011 /*
Frederic Weisbecker76369132011-05-19 19:55:04 +02006012 * If it's not a per-cpu rb, it must be the same task.
Peter Zijlstra0f139302010-05-20 14:35:15 +02006013 */
6014 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6015 goto out;
6016
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006017set:
6018 mutex_lock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006019 /* Can't redirect output if we've got an active mmap() */
6020 if (atomic_read(&event->mmap_count))
6021 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006022
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006023 if (output_event) {
Frederic Weisbecker76369132011-05-19 19:55:04 +02006024 /* get the rb we want to redirect to */
6025 rb = ring_buffer_get(output_event);
6026 if (!rb)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006027 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006028 }
6029
Frederic Weisbecker76369132011-05-19 19:55:04 +02006030 old_rb = event->rb;
6031 rcu_assign_pointer(event->rb, rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006032 ret = 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006033unlock:
6034 mutex_unlock(&event->mmap_mutex);
6035
Frederic Weisbecker76369132011-05-19 19:55:04 +02006036 if (old_rb)
6037 ring_buffer_put(old_rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006038out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006039 return ret;
6040}
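/*
 * Editor's illustrative sketch (not part of this file): user space reaches
 * perf_event_set_output() through the PERF_EVENT_IOC_SET_OUTPUT ioctl,
 * typically to share one mmap()ed ring buffer between several events. The
 * fd names and the helper are made up for the example.
 */
#if 0	/* user-space example, not kernel code */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int redirect_output(int event_fd, int target_fd)
{
	/* samples from event_fd now land in target_fd's ring buffer */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}
#endif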
6041
6042/**
6043 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6044 *
6045 * @attr_uptr: event_id type attributes for monitoring/sampling
6046 * @pid: target pid
6047 * @cpu: target cpu
6048 * @group_fd: group leader event fd
6049 */
6050SYSCALL_DEFINE5(perf_event_open,
6051 struct perf_event_attr __user *, attr_uptr,
6052 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6053{
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006054 struct perf_event *group_leader = NULL, *output_event = NULL;
6055 struct perf_event *event, *sibling;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006056 struct perf_event_attr attr;
6057 struct perf_event_context *ctx;
6058 struct file *event_file = NULL;
6059 struct file *group_file = NULL;
Matt Helsley38a81da2010-09-13 13:01:20 -07006060 struct task_struct *task = NULL;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006061 struct pmu *pmu;
Al Viroea635c62010-05-26 17:40:29 -04006062 int event_fd;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006063 int move_group = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006064 int fput_needed = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006065 int err;
6066
6067 /* for future expandability... */
Stephane Eraniane5d13672011-02-14 11:20:01 +02006068 if (flags & ~PERF_FLAG_ALL)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006069 return -EINVAL;
6070
6071 err = perf_copy_attr(attr_uptr, &attr);
6072 if (err)
6073 return err;
6074
6075 if (!attr.exclude_kernel) {
6076 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6077 return -EACCES;
6078 }
6079
6080 if (attr.freq) {
6081 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6082 return -EINVAL;
6083 }
6084
Stephane Eraniane5d13672011-02-14 11:20:01 +02006085 /*
6086 * In cgroup mode, the pid argument is used to pass the fd
6087 * opened to the cgroup directory in cgroupfs. The cpu argument
6088 * designates the cpu on which to monitor threads from that
6089 * cgroup.
6090 */
6091 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6092 return -EINVAL;
6093
Al Viroea635c62010-05-26 17:40:29 -04006094 event_fd = get_unused_fd_flags(O_RDWR);
6095 if (event_fd < 0)
6096 return event_fd;
6097
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006098 if (group_fd != -1) {
6099 group_leader = perf_fget_light(group_fd, &fput_needed);
6100 if (IS_ERR(group_leader)) {
6101 err = PTR_ERR(group_leader);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006102 goto err_fd;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006103 }
6104 group_file = group_leader->filp;
6105 if (flags & PERF_FLAG_FD_OUTPUT)
6106 output_event = group_leader;
6107 if (flags & PERF_FLAG_FD_NO_GROUP)
6108 group_leader = NULL;
6109 }
6110
Stephane Eraniane5d13672011-02-14 11:20:01 +02006111 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006112 task = find_lively_task_by_vpid(pid);
6113 if (IS_ERR(task)) {
6114 err = PTR_ERR(task);
6115 goto err_group_fd;
6116 }
6117 }
6118
Avi Kivity4dc0da82011-06-29 18:42:35 +03006119 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6120 NULL, NULL);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006121 if (IS_ERR(event)) {
6122 err = PTR_ERR(event);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006123 goto err_task;
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006124 }
6125
Stephane Eraniane5d13672011-02-14 11:20:01 +02006126 if (flags & PERF_FLAG_PID_CGROUP) {
6127 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6128 if (err)
6129 goto err_alloc;
Peter Zijlstra08309372011-03-03 11:31:20 +01006130 /*
6131 * one more event:
6132 * - that has cgroup constraint on event->cpu
6133 * - that may need work on context switch
6134 */
6135 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6136 jump_label_inc(&perf_sched_events);
Stephane Eraniane5d13672011-02-14 11:20:01 +02006137 }
6138
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006139 /*
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006140 * Special case software events and allow them to be part of
6141 * any hardware group.
6142 */
6143 pmu = event->pmu;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006144
6145 if (group_leader &&
6146 (is_software_event(event) != is_software_event(group_leader))) {
6147 if (is_software_event(event)) {
6148 /*
6149 * If event and group_leader are not both a software
6150 * event, and event is, then group leader is not.
6151 *
6152 * Allow the addition of software events to !software
6153 * groups; this is safe because software events never
6154 * fail to schedule.
6155 */
6156 pmu = group_leader->pmu;
6157 } else if (is_software_event(group_leader) &&
6158 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6159 /*
6160 * In case the group is a pure software group, and we
6161 * try to add a hardware event, move the whole group to
6162 * the hardware context.
6163 */
6164 move_group = 1;
6165 }
6166 }
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006167
6168 /*
6169 * Get the target context (task or percpu):
6170 */
Matt Helsley38a81da2010-09-13 13:01:20 -07006171 ctx = find_get_context(pmu, task, cpu);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006172 if (IS_ERR(ctx)) {
6173 err = PTR_ERR(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006174 goto err_alloc;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006175 }
6176
Peter Zijlstrafd1edb32011-03-28 13:13:56 +02006177 if (task) {
6178 put_task_struct(task);
6179 task = NULL;
6180 }
6181
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006182 /*
6183 * Look up the group leader (we will attach this event to it):
6184 */
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006185 if (group_leader) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006186 err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006187
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006188 /*
6189 * Do not allow a recursive hierarchy (this new sibling
6190 * becoming part of another group-sibling):
6191 */
6192 if (group_leader->group_leader != group_leader)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006193 goto err_context;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006194 /*
6195 * Do not allow to attach to a group in a different
6196 * task or CPU context:
6197 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006198 if (move_group) {
6199 if (group_leader->ctx->type != ctx->type)
6200 goto err_context;
6201 } else {
6202 if (group_leader->ctx != ctx)
6203 goto err_context;
6204 }
6205
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006206 /*
6207 * Only a group leader can be exclusive or pinned
6208 */
6209 if (attr.exclusive || attr.pinned)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006210 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006211 }
6212
6213 if (output_event) {
6214 err = perf_event_set_output(event, output_event);
6215 if (err)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006216 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006217 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006218
Al Viroea635c62010-05-26 17:40:29 -04006219 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6220 if (IS_ERR(event_file)) {
6221 err = PTR_ERR(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006222 goto err_context;
Al Viroea635c62010-05-26 17:40:29 -04006223 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006224
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006225 if (move_group) {
6226 struct perf_event_context *gctx = group_leader->ctx;
6227
6228 mutex_lock(&gctx->mutex);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006229 perf_remove_from_context(group_leader);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006230 list_for_each_entry(sibling, &group_leader->sibling_list,
6231 group_entry) {
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006232 perf_remove_from_context(sibling);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006233 put_ctx(gctx);
6234 }
6235 mutex_unlock(&gctx->mutex);
6236 put_ctx(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006237 }
6238
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006239 event->filp = event_file;
6240 WARN_ON_ONCE(ctx->parent_ctx);
6241 mutex_lock(&ctx->mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006242
6243 if (move_group) {
6244 perf_install_in_context(ctx, group_leader, cpu);
6245 get_ctx(ctx);
6246 list_for_each_entry(sibling, &group_leader->sibling_list,
6247 group_entry) {
6248 perf_install_in_context(ctx, sibling, cpu);
6249 get_ctx(ctx);
6250 }
6251 }
6252
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006253 perf_install_in_context(ctx, event, cpu);
6254 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006255 perf_unpin_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006256 mutex_unlock(&ctx->mutex);
6257
6258 event->owner = current;
Peter Zijlstra88821352010-11-09 19:01:43 +01006259
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006260 mutex_lock(&current->perf_event_mutex);
6261 list_add_tail(&event->owner_entry, &current->perf_event_list);
6262 mutex_unlock(&current->perf_event_mutex);
6263
Peter Zijlstra8a495422010-05-27 15:47:49 +02006264 /*
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006265 * Precalculate sample_data sizes
6266 */
6267 perf_event__header_size(event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006268 perf_event__id_header_size(event);
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006269
6270 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +02006271 * Drop the reference on the group_event after placing the
6272 * new event on the sibling_list. This ensures destruction
6273 * of the group leader will find the pointer to itself in
6274 * perf_group_detach().
6275 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006276 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006277 fd_install(event_fd, event_file);
6278 return event_fd;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006279
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006280err_context:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006281 perf_unpin_context(ctx);
Al Viroea635c62010-05-26 17:40:29 -04006282 put_ctx(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006283err_alloc:
6284 free_event(event);
Peter Zijlstrae7d0bc02010-10-14 16:54:51 +02006285err_task:
6286 if (task)
6287 put_task_struct(task);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006288err_group_fd:
6289 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006290err_fd:
6291 put_unused_fd(event_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006292 return err;
6293}
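/*
 * Editor's illustrative sketch (not part of this file): a minimal user-space
 * caller of the syscall above, counting instructions for the calling task.
 * It follows the usual perf_event_open(2) pattern; the wrapper name and the
 * chosen event are arbitrary.
 */
#if 0	/* user-space example, not kernel code */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/* pid == 0, cpu == -1: this task, on whatever cpu it runs */
	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload to measure ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %lld\n", count);
	close(fd);
	return 0;
}
#endif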
6294
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006295/**
6296 * perf_event_create_kernel_counter
6297 *
6298 * @attr: attributes of the counter to create
6299 * @cpu: cpu on which the counter is bound
Matt Helsley38a81da2010-09-13 13:01:20 -07006300 * @task: task to profile (NULL for percpu)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006301 */
6302struct perf_event *
6303perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Matt Helsley38a81da2010-09-13 13:01:20 -07006304 struct task_struct *task,
Avi Kivity4dc0da82011-06-29 18:42:35 +03006305 perf_overflow_handler_t overflow_handler,
6306 void *context)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006307{
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006308 struct perf_event_context *ctx;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006309 struct perf_event *event;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006310 int err;
6311
6312 /*
6313 * Get the target context (task or percpu):
6314 */
6315
Avi Kivity4dc0da82011-06-29 18:42:35 +03006316 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6317 overflow_handler, context);
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006318 if (IS_ERR(event)) {
6319 err = PTR_ERR(event);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006320 goto err;
6321 }
6322
Matt Helsley38a81da2010-09-13 13:01:20 -07006323 ctx = find_get_context(event->pmu, task, cpu);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006324 if (IS_ERR(ctx)) {
6325 err = PTR_ERR(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006326 goto err_free;
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006327 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006328
6329 event->filp = NULL;
6330 WARN_ON_ONCE(ctx->parent_ctx);
6331 mutex_lock(&ctx->mutex);
6332 perf_install_in_context(ctx, event, cpu);
6333 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006334 perf_unpin_context(ctx);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006335 mutex_unlock(&ctx->mutex);
6336
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006337 return event;
6338
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006339err_free:
6340 free_event(event);
6341err:
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006342 return ERR_PTR(err);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006343}
6344EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
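/*
 * Editor's illustrative sketch (not part of this file): roughly how in-kernel
 * users (the hw_breakpoint and watchdog code, for instance) call the helper
 * above. The my_* names, the chosen event and the period are made up, normal
 * kernel includes are assumed, and the three-argument handler prototype is
 * assumed to match perf_overflow_handler_t in this tree.
 */
#if 0	/* example only, not compiled */
static struct perf_event *my_event;

static void my_overflow(struct perf_event *event,
			struct perf_sample_data *data, struct pt_regs *regs)
{
	/* runs from the PMU interrupt each time sample_period expires */
}

static int my_counter_start(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(struct perf_event_attr),
		.sample_period	= 10000000,
	};

	/* task == NULL: a cpu-bound counter rather than a task counter */
	my_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
						    my_overflow, NULL);
	if (IS_ERR(my_event))
		return PTR_ERR(my_event);
	return 0;
}
#endif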
6345
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006346static void sync_child_event(struct perf_event *child_event,
6347 struct task_struct *child)
6348{
6349 struct perf_event *parent_event = child_event->parent;
6350 u64 child_val;
6351
6352 if (child_event->attr.inherit_stat)
6353 perf_event_read_event(child_event, child);
6354
Peter Zijlstrab5e58792010-05-21 14:43:12 +02006355 child_val = perf_event_count(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006356
6357 /*
6358 * Add back the child's count to the parent's count:
6359 */
Peter Zijlstraa6e6dea2010-05-21 14:27:58 +02006360 atomic64_add(child_val, &parent_event->child_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006361 atomic64_add(child_event->total_time_enabled,
6362 &parent_event->child_total_time_enabled);
6363 atomic64_add(child_event->total_time_running,
6364 &parent_event->child_total_time_running);
6365
6366 /*
6367 * Remove this event from the parent's list
6368 */
6369 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6370 mutex_lock(&parent_event->child_mutex);
6371 list_del_init(&child_event->child_list);
6372 mutex_unlock(&parent_event->child_mutex);
6373
6374 /*
6375 * Release the parent event, if this was the last
6376 * reference to it.
6377 */
6378 fput(parent_event->filp);
6379}
6380
6381static void
6382__perf_event_exit_task(struct perf_event *child_event,
6383 struct perf_event_context *child_ctx,
6384 struct task_struct *child)
6385{
Peter Zijlstra38b435b2011-03-15 14:37:10 +01006386 if (child_event->parent) {
6387 raw_spin_lock_irq(&child_ctx->lock);
6388 perf_group_detach(child_event);
6389 raw_spin_unlock_irq(&child_ctx->lock);
6390 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006391
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006392 perf_remove_from_context(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006393
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006394 /*
Peter Zijlstra38b435b2011-03-15 14:37:10 +01006395 * It can happen that the parent exits first, and has events
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006396 * that are still around due to the child reference. These
Peter Zijlstra38b435b2011-03-15 14:37:10 +01006397 * events need to be zapped.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006398 */
Peter Zijlstra38b435b2011-03-15 14:37:10 +01006399 if (child_event->parent) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006400 sync_child_event(child_event, child);
6401 free_event(child_event);
6402 }
6403}
6404
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006405static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006406{
6407 struct perf_event *child_event, *tmp;
6408 struct perf_event_context *child_ctx;
6409 unsigned long flags;
6410
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006411 if (likely(!child->perf_event_ctxp[ctxn])) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006412 perf_event_task(child, NULL, 0);
6413 return;
6414 }
6415
6416 local_irq_save(flags);
6417 /*
6418 * We can't reschedule here because interrupts are disabled,
6419 * and either child is current or it is a task that can't be
6420 * scheduled, so we are now safe from rescheduling changing
6421 * our context.
6422 */
Oleg Nesterov806839b2011-01-21 18:45:47 +01006423 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006424
6425 /*
6426 * Take the context lock here so that if find_get_context is
6427 * reading child->perf_event_ctxp, we wait until it has
6428 * incremented the context's refcount before we do put_ctx below.
6429 */
Thomas Gleixnere625cce2009-11-17 18:02:06 +01006430 raw_spin_lock(&child_ctx->lock);
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02006431 task_ctx_sched_out(child_ctx);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006432 child->perf_event_ctxp[ctxn] = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006433 /*
6434 * If this context is a clone, unclone it so it can't get
6435 * swapped to another process while we're removing all
6436 * the events from it.
6437 */
6438 unclone_ctx(child_ctx);
Peter Zijlstra5e942bb2009-11-23 11:37:26 +01006439 update_context_time(child_ctx);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01006440 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006441
6442 /*
6443 * Report the task dead after unscheduling the events so that we
6444 * won't get any samples after PERF_RECORD_EXIT. We can however still
6445 * get a few PERF_RECORD_READ events.
6446 */
6447 perf_event_task(child, child_ctx, 0);
6448
6449 /*
6450 * We can recurse on the same lock type through:
6451 *
6452 * __perf_event_exit_task()
6453 * sync_child_event()
6454 * fput(parent_event->filp)
6455 * perf_release()
6456 * mutex_lock(&ctx->mutex)
6457 *
6458 * But since it's the parent context it won't be the same instance.
6459 */
Peter Zijlstraa0507c82010-05-06 15:42:53 +02006460 mutex_lock(&child_ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006461
6462again:
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006463 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6464 group_entry)
6465 __perf_event_exit_task(child_event, child_ctx, child);
6466
6467 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006468 group_entry)
6469 __perf_event_exit_task(child_event, child_ctx, child);
6470
6471 /*
6472 * If the last event was a group event, it will have appended all
6473 * its siblings to the list, but we obtained 'tmp' before that which
6474 * will still point to the list head terminating the iteration.
6475 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006476 if (!list_empty(&child_ctx->pinned_groups) ||
6477 !list_empty(&child_ctx->flexible_groups))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006478 goto again;
6479
6480 mutex_unlock(&child_ctx->mutex);
6481
6482 put_ctx(child_ctx);
6483}
6484
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006485/*
6486 * When a child task exits, feed back event values to parent events.
6487 */
6488void perf_event_exit_task(struct task_struct *child)
6489{
Peter Zijlstra88821352010-11-09 19:01:43 +01006490 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006491 int ctxn;
6492
Peter Zijlstra88821352010-11-09 19:01:43 +01006493 mutex_lock(&child->perf_event_mutex);
6494 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6495 owner_entry) {
6496 list_del_init(&event->owner_entry);
6497
6498 /*
6499 * Ensure the list deletion is visible before we clear
6500 * the owner; this closes a race against perf_release() where
6501 * we need to serialize on the owner->perf_event_mutex.
6502 */
6503 smp_wmb();
6504 event->owner = NULL;
6505 }
6506 mutex_unlock(&child->perf_event_mutex);
6507
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006508 for_each_task_context_nr(ctxn)
6509 perf_event_exit_task_context(child, ctxn);
6510}
6511
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006512static void perf_free_event(struct perf_event *event,
6513 struct perf_event_context *ctx)
6514{
6515 struct perf_event *parent = event->parent;
6516
6517 if (WARN_ON_ONCE(!parent))
6518 return;
6519
6520 mutex_lock(&parent->child_mutex);
6521 list_del_init(&event->child_list);
6522 mutex_unlock(&parent->child_mutex);
6523
6524 fput(parent->filp);
6525
Peter Zijlstra8a495422010-05-27 15:47:49 +02006526 perf_group_detach(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006527 list_del_event(event, ctx);
6528 free_event(event);
6529}
6530
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006531/*
6532 * free an unexposed, unused context as created by inheritance by
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006533 * perf_event_init_task below, used by fork() in case of failure.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006534 */
6535void perf_event_free_task(struct task_struct *task)
6536{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006537 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006538 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006539 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006540
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006541 for_each_task_context_nr(ctxn) {
6542 ctx = task->perf_event_ctxp[ctxn];
6543 if (!ctx)
6544 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006545
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006546 mutex_lock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006547again:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006548 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6549 group_entry)
6550 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006551
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006552 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6553 group_entry)
6554 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006555
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006556 if (!list_empty(&ctx->pinned_groups) ||
6557 !list_empty(&ctx->flexible_groups))
6558 goto again;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006559
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006560 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006561
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006562 put_ctx(ctx);
6563 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006564}
6565
Peter Zijlstra4e231c72010-09-09 21:01:59 +02006566void perf_event_delayed_put(struct task_struct *task)
6567{
6568 int ctxn;
6569
6570 for_each_task_context_nr(ctxn)
6571 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6572}
6573
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006574/*
6575 * inherit an event from parent task to child task:
6576 */
6577static struct perf_event *
6578inherit_event(struct perf_event *parent_event,
6579 struct task_struct *parent,
6580 struct perf_event_context *parent_ctx,
6581 struct task_struct *child,
6582 struct perf_event *group_leader,
6583 struct perf_event_context *child_ctx)
6584{
6585 struct perf_event *child_event;
Peter Zijlstracee010e2010-09-10 12:51:54 +02006586 unsigned long flags;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006587
6588 /*
6589 * Instead of creating recursive hierarchies of events,
6590 * we link inherited events back to the original parent,
6591 * which has a filp for sure, which we use as the reference
6592 * count:
6593 */
6594 if (parent_event->parent)
6595 parent_event = parent_event->parent;
6596
6597 child_event = perf_event_alloc(&parent_event->attr,
6598 parent_event->cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006599 child,
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006600 group_leader, parent_event,
Avi Kivity4dc0da82011-06-29 18:42:35 +03006601 NULL, NULL);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006602 if (IS_ERR(child_event))
6603 return child_event;
6604 get_ctx(child_ctx);
6605
6606 /*
6607 * Make the child state follow the state of the parent event,
6608 * not its attr.disabled bit. We hold the parent's mutex,
6609 * so we won't race with perf_event_{en, dis}able_family.
6610 */
6611 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6612 child_event->state = PERF_EVENT_STATE_INACTIVE;
6613 else
6614 child_event->state = PERF_EVENT_STATE_OFF;
6615
6616 if (parent_event->attr.freq) {
6617 u64 sample_period = parent_event->hw.sample_period;
6618 struct hw_perf_event *hwc = &child_event->hw;
6619
6620 hwc->sample_period = sample_period;
6621 hwc->last_period = sample_period;
6622
6623 local64_set(&hwc->period_left, sample_period);
6624 }
6625
6626 child_event->ctx = child_ctx;
6627 child_event->overflow_handler = parent_event->overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03006628 child_event->overflow_handler_context
6629 = parent_event->overflow_handler_context;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006630
6631 /*
Thomas Gleixner614b6782010-12-03 16:24:32 -02006632 * Precalculate sample_data sizes
6633 */
6634 perf_event__header_size(child_event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006635 perf_event__id_header_size(child_event);
Thomas Gleixner614b6782010-12-03 16:24:32 -02006636
6637 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006638 * Link it up in the child's context:
6639 */
Peter Zijlstracee010e2010-09-10 12:51:54 +02006640 raw_spin_lock_irqsave(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006641 add_event_to_ctx(child_event, child_ctx);
Peter Zijlstracee010e2010-09-10 12:51:54 +02006642 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006643
6644 /*
6645 * Get a reference to the parent filp - we will fput it
6646 * when the child event exits. This is safe to do because
6647 * we are in the parent and we know that the filp still
6648 * exists and has a nonzero count:
6649 */
6650 atomic_long_inc(&parent_event->filp->f_count);
6651
6652 /*
6653 * Link this into the parent event's child list
6654 */
6655 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6656 mutex_lock(&parent_event->child_mutex);
6657 list_add_tail(&child_event->child_list, &parent_event->child_list);
6658 mutex_unlock(&parent_event->child_mutex);
6659
6660 return child_event;
6661}
6662
6663static int inherit_group(struct perf_event *parent_event,
6664 struct task_struct *parent,
6665 struct perf_event_context *parent_ctx,
6666 struct task_struct *child,
6667 struct perf_event_context *child_ctx)
6668{
6669 struct perf_event *leader;
6670 struct perf_event *sub;
6671 struct perf_event *child_ctr;
6672
6673 leader = inherit_event(parent_event, parent, parent_ctx,
6674 child, NULL, child_ctx);
6675 if (IS_ERR(leader))
6676 return PTR_ERR(leader);
6677 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6678 child_ctr = inherit_event(sub, parent, parent_ctx,
6679 child, leader, child_ctx);
6680 if (IS_ERR(child_ctr))
6681 return PTR_ERR(child_ctr);
6682 }
6683 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006684}
6685
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006686static int
6687inherit_task_group(struct perf_event *event, struct task_struct *parent,
6688 struct perf_event_context *parent_ctx,
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006689 struct task_struct *child, int ctxn,
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006690 int *inherited_all)
6691{
6692 int ret;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006693 struct perf_event_context *child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006694
6695 if (!event->attr.inherit) {
6696 *inherited_all = 0;
6697 return 0;
6698 }
6699
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006700 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006701 if (!child_ctx) {
6702 /*
6703 * This is executed from the parent task context, so
6704 * inherit events that have been marked for cloning.
6705 * First allocate and initialize a context for the
6706 * child.
6707 */
6708
Peter Zijlstraeb184472010-09-07 15:55:13 +02006709 child_ctx = alloc_perf_context(event->pmu, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006710 if (!child_ctx)
6711 return -ENOMEM;
6712
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006713 child->perf_event_ctxp[ctxn] = child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006714 }
6715
6716 ret = inherit_group(event, parent, parent_ctx,
6717 child, child_ctx);
6718
6719 if (ret)
6720 *inherited_all = 0;
6721
6722 return ret;
6723}
6724
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006725/*
6726 * Initialize the perf_event context in task_struct
6727 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006728int perf_event_init_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006729{
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006730 struct perf_event_context *child_ctx, *parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006731 struct perf_event_context *cloned_ctx;
6732 struct perf_event *event;
6733 struct task_struct *parent = current;
6734 int inherited_all = 1;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01006735 unsigned long flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006736 int ret = 0;
6737
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006738 if (likely(!parent->perf_event_ctxp[ctxn]))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006739 return 0;
6740
6741 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006742 * If the parent's context is a clone, pin it so it won't get
6743 * swapped under us.
6744 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006745 parent_ctx = perf_pin_task_context(parent, ctxn);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006746
6747 /*
6748 * No need to check if parent_ctx != NULL here; since we saw
6749 * it non-NULL earlier, the only reason for it to become NULL
6750 * is if we exit, and since we're currently in the middle of
6751 * a fork we can't be exiting at the same time.
6752 */
6753
6754 /*
6755 * Lock the parent list. No need to lock the child - not PID
6756 * hashed yet and not running, so nobody can access it.
6757 */
6758 mutex_lock(&parent_ctx->mutex);
6759
6760 /*
6761 * We don't have to disable NMIs - we are only looking at
6762 * the list, not manipulating it:
6763 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006764 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006765 ret = inherit_task_group(event, parent, parent_ctx,
6766 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006767 if (ret)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006768 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006769 }
6770
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01006771 /*
6772 * We can't hold ctx->lock when iterating the ->flexible_groups list due
6773 * to allocations, but we need to prevent rotation because
6774 * rotate_ctx() will change the list from interrupt context.
6775 */
6776 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6777 parent_ctx->rotate_disable = 1;
6778 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6779
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006780 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006781 ret = inherit_task_group(event, parent, parent_ctx,
6782 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006783 if (ret)
6784 break;
6785 }
6786
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01006787 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6788 parent_ctx->rotate_disable = 0;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01006789
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006790 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006791
Peter Zijlstra05cbaa22009-12-30 16:00:35 +01006792 if (child_ctx && inherited_all) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006793 /*
6794 * Mark the child context as a clone of the parent
6795 * context, or of whatever the parent is a clone of.
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01006796 *
6797 * Note that if the parent is a clone, the holding of
6798 * parent_ctx->lock avoids it from being uncloned.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006799 */
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01006800 cloned_ctx = parent_ctx->parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006801 if (cloned_ctx) {
6802 child_ctx->parent_ctx = cloned_ctx;
6803 child_ctx->parent_gen = parent_ctx->parent_gen;
6804 } else {
6805 child_ctx->parent_ctx = parent_ctx;
6806 child_ctx->parent_gen = parent_ctx->generation;
6807 }
6808 get_ctx(child_ctx->parent_ctx);
6809 }
6810
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01006811 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006812 mutex_unlock(&parent_ctx->mutex);
6813
6814 perf_unpin_context(parent_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006815 put_ctx(parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006816
6817 return ret;
6818}
6819
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006820/*
6821 * Initialize the perf_event context in task_struct
6822 */
6823int perf_event_init_task(struct task_struct *child)
6824{
6825 int ctxn, ret;
6826
Oleg Nesterov8550d7c2011-01-19 19:22:28 +01006827 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
6828 mutex_init(&child->perf_event_mutex);
6829 INIT_LIST_HEAD(&child->perf_event_list);
6830
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006831 for_each_task_context_nr(ctxn) {
6832 ret = perf_event_init_context(child, ctxn);
6833 if (ret)
6834 return ret;
6835 }
6836
6837 return 0;
6838}
6839
Paul Mackerras220b1402010-03-10 20:45:52 +11006840static void __init perf_event_init_all_cpus(void)
6841{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006842 struct swevent_htable *swhash;
Paul Mackerras220b1402010-03-10 20:45:52 +11006843 int cpu;
Paul Mackerras220b1402010-03-10 20:45:52 +11006844
6845 for_each_possible_cpu(cpu) {
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006846 swhash = &per_cpu(swevent_htable, cpu);
6847 mutex_init(&swhash->hlist_mutex);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006848 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
Paul Mackerras220b1402010-03-10 20:45:52 +11006849 }
6850}
6851
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006852static void __cpuinit perf_event_init_cpu(int cpu)
6853{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006854 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006855
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006856 mutex_lock(&swhash->hlist_mutex);
Peter Zijlstra144060f2011-08-01 12:49:14 +02006857 if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006858 struct swevent_hlist *hlist;
6859
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006860 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
6861 WARN_ON(!hlist);
6862 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006863 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006864 mutex_unlock(&swhash->hlist_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006865}
6866
Peter Zijlstrac2774432010-12-08 15:29:02 +01006867#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006868static void perf_pmu_rotate_stop(struct pmu *pmu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006869{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006870 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6871
6872 WARN_ON(!irqs_disabled());
6873
6874 list_del_init(&cpuctx->rotation_list);
6875}
6876
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006877static void __perf_event_exit_context(void *__info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006878{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006879 struct perf_event_context *ctx = __info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006880 struct perf_event *event, *tmp;
6881
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006882 perf_pmu_rotate_stop(ctx->pmu);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02006883
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006884 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006885 __perf_remove_from_context(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006886 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006887 __perf_remove_from_context(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006888}
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006889
6890static void perf_event_exit_cpu_context(int cpu)
6891{
6892 struct perf_event_context *ctx;
6893 struct pmu *pmu;
6894 int idx;
6895
6896 idx = srcu_read_lock(&pmus_srcu);
6897 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra917bdd12010-09-17 11:28:49 +02006898 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006899
6900 mutex_lock(&ctx->mutex);
6901 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6902 mutex_unlock(&ctx->mutex);
6903 }
6904 srcu_read_unlock(&pmus_srcu, idx);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006905}
6906
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006907static void perf_event_exit_cpu(int cpu)
6908{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006909 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006910
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006911 mutex_lock(&swhash->hlist_mutex);
6912 swevent_hlist_release(swhash);
6913 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006914
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006915 perf_event_exit_cpu_context(cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006916}
6917#else
6918static inline void perf_event_exit_cpu(int cpu) { }
6919#endif
6920
Peter Zijlstrac2774432010-12-08 15:29:02 +01006921static int
6922perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6923{
6924 int cpu;
6925
6926 for_each_online_cpu(cpu)
6927 perf_event_exit_cpu(cpu);
6928
6929 return NOTIFY_OK;
6930}
6931
6932/*
6933 * Run the perf reboot notifier at the very last possible moment so that
6934 * the generic watchdog code runs as long as possible.
6935 */
6936static struct notifier_block perf_reboot_notifier = {
6937 .notifier_call = perf_reboot,
6938 .priority = INT_MIN,
6939};
6940
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006941static int __cpuinit
6942perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6943{
6944 unsigned int cpu = (long)hcpu;
6945
Peter Zijlstra144060f2011-08-01 12:49:14 +02006946 /*
6947 * Ignore suspend/resume action; the perf_pm_notifier will
6948 * take care of that.
6949 */
6950 if (action & CPU_TASKS_FROZEN)
6951 return NOTIFY_OK;
6952
6953 switch (action) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006954
6955 case CPU_UP_PREPARE:
Peter Zijlstra5e116372010-06-11 13:35:08 +02006956 case CPU_DOWN_FAILED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006957 perf_event_init_cpu(cpu);
6958 break;
6959
Peter Zijlstra5e116372010-06-11 13:35:08 +02006960 case CPU_UP_CANCELED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006961 case CPU_DOWN_PREPARE:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006962 perf_event_exit_cpu(cpu);
6963 break;
6964
6965 default:
6966 break;
6967 }
6968
6969 return NOTIFY_OK;
6970}
6971
Peter Zijlstra144060f2011-08-01 12:49:14 +02006972static void perf_pm_resume_cpu(void *unused)
6973{
6974 struct perf_cpu_context *cpuctx;
6975 struct perf_event_context *ctx;
6976 struct pmu *pmu;
6977 int idx;
6978
6979 idx = srcu_read_lock(&pmus_srcu);
6980 list_for_each_entry_rcu(pmu, &pmus, entry) {
6981 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6982 ctx = cpuctx->task_ctx;
6983
6984 perf_ctx_lock(cpuctx, ctx);
6985 perf_pmu_disable(cpuctx->ctx.pmu);
6986
6987 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
6988 if (ctx)
6989 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
6990
6991 perf_pmu_enable(cpuctx->ctx.pmu);
6992 perf_ctx_unlock(cpuctx, ctx);
6993 }
6994 srcu_read_unlock(&pmus_srcu, idx);
6995}
6996
6997static void perf_pm_suspend_cpu(void *unused)
6998{
6999 struct perf_cpu_context *cpuctx;
7000 struct perf_event_context *ctx;
7001 struct pmu *pmu;
7002 int idx;
7003
7004 idx = srcu_read_lock(&pmus_srcu);
7005 list_for_each_entry_rcu(pmu, &pmus, entry) {
7006 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7007 ctx = cpuctx->task_ctx;
7008
7009 perf_ctx_lock(cpuctx, ctx);
7010 perf_pmu_disable(cpuctx->ctx.pmu);
7011
7012 perf_event_sched_in(cpuctx, ctx, current);
7013
7014 perf_pmu_enable(cpuctx->ctx.pmu);
7015 perf_ctx_unlock(cpuctx, ctx);
7016 }
7017 srcu_read_unlock(&pmus_srcu, idx);
7018}
7019
7020static int perf_resume(void)
7021{
7022 get_online_cpus();
7023 smp_call_function(perf_pm_resume_cpu, NULL, 1);
7024 put_online_cpus();
7025
7026 return NOTIFY_OK;
7027}
7028
7029static int perf_suspend(void)
7030{
7031 get_online_cpus();
7032 smp_call_function(perf_pm_suspend_cpu, NULL, 1);
7033 put_online_cpus();
7034
7035 return NOTIFY_OK;
7036}
7037
7038static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
7039{
7040 switch (action) {
7041 case PM_POST_HIBERNATION:
7042 case PM_POST_SUSPEND:
7043 return perf_resume();
7044 case PM_HIBERNATION_PREPARE:
7045 case PM_SUSPEND_PREPARE:
7046 return perf_suspend();
7047 default:
7048 return NOTIFY_DONE;
7049 }
7050}
7051
7052static struct notifier_block perf_pm_notifier = {
7053 .notifier_call = perf_pm,
7054};
7055
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007056void __init perf_event_init(void)
7057{
Jason Wessel3c502e72010-11-04 17:33:01 -05007058 int ret;
7059
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007060 idr_init(&pmu_idr);
7061
Paul Mackerras220b1402010-03-10 20:45:52 +11007062 perf_event_init_all_cpus();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007063 init_srcu_struct(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007064 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7065 perf_pmu_register(&perf_cpu_clock, NULL, -1);
7066 perf_pmu_register(&perf_task_clock, NULL, -1);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007067 perf_tp_register();
7068 perf_cpu_notifier(perf_cpu_notify);
Peter Zijlstrac2774432010-12-08 15:29:02 +01007069 register_reboot_notifier(&perf_reboot_notifier);
Peter Zijlstra144060f2011-08-01 12:49:14 +02007070 register_pm_notifier(&perf_pm_notifier);
Jason Wessel3c502e72010-11-04 17:33:01 -05007071
7072 ret = init_hw_breakpoint();
7073 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007074}
Peter Zijlstraabe43402010-11-17 23:17:37 +01007075
7076static int __init perf_event_sysfs_init(void)
7077{
7078 struct pmu *pmu;
7079 int ret;
7080
7081 mutex_lock(&pmus_lock);
7082
7083 ret = bus_register(&pmu_bus);
7084 if (ret)
7085 goto unlock;
7086
7087 list_for_each_entry(pmu, &pmus, entry) {
7088 if (!pmu->name || pmu->type < 0)
7089 continue;
7090
7091 ret = pmu_dev_alloc(pmu);
7092 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7093 }
7094 pmu_bus_running = 1;
7095 ret = 0;
7096
7097unlock:
7098 mutex_unlock(&pmus_lock);
7099
7100 return ret;
7101}
7102device_initcall(perf_event_sysfs_init);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007103
7104#ifdef CONFIG_CGROUP_PERF
7105static struct cgroup_subsys_state *perf_cgroup_create(
7106 struct cgroup_subsys *ss, struct cgroup *cont)
7107{
7108 struct perf_cgroup *jc;
Stephane Eraniane5d13672011-02-14 11:20:01 +02007109
Li Zefan1b15d052011-03-03 14:26:06 +08007110 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007111 if (!jc)
7112 return ERR_PTR(-ENOMEM);
7113
Stephane Eraniane5d13672011-02-14 11:20:01 +02007114 jc->info = alloc_percpu(struct perf_cgroup_info);
7115 if (!jc->info) {
7116 kfree(jc);
7117 return ERR_PTR(-ENOMEM);
7118 }
7119
Stephane Eraniane5d13672011-02-14 11:20:01 +02007120 return &jc->css;
7121}
7122
7123static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7124 struct cgroup *cont)
7125{
7126 struct perf_cgroup *jc;
7127 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7128 struct perf_cgroup, css);
7129 free_percpu(jc->info);
7130 kfree(jc);
7131}
7132
7133static int __perf_cgroup_move(void *info)
7134{
7135 struct task_struct *task = info;
7136 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7137 return 0;
7138}
7139
Peter Zijlstra74c355f2011-05-30 16:48:06 +02007140static void
7141perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
Stephane Eraniane5d13672011-02-14 11:20:01 +02007142{
7143 task_function_call(task, __perf_cgroup_move, task);
7144}
7145
Stephane Eraniane5d13672011-02-14 11:20:01 +02007146static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7147 struct cgroup *old_cgrp, struct task_struct *task)
7148{
7149 /*
7150 * cgroup_exit() is called in the copy_process() failure path.
7151 * Ignore this case since the task hasn't run yet; this avoids
7152 * trying to poke a half freed task state from generic code.
7153 */
7154 if (!(task->flags & PF_EXITING))
7155 return;
7156
Peter Zijlstra74c355f2011-05-30 16:48:06 +02007157 perf_cgroup_attach_task(cgrp, task);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007158}
7159
7160struct cgroup_subsys perf_subsys = {
Ingo Molnare7e7ee22011-05-04 08:42:29 +02007161 .name = "perf_event",
7162 .subsys_id = perf_subsys_id,
7163 .create = perf_cgroup_create,
7164 .destroy = perf_cgroup_destroy,
7165 .exit = perf_cgroup_exit,
Peter Zijlstra74c355f2011-05-30 16:48:06 +02007166 .attach_task = perf_cgroup_attach_task,
Stephane Eraniane5d13672011-02-14 11:20:01 +02007167};
7168#endif /* CONFIG_CGROUP_PERF */