aboutsummaryrefslogtreecommitdiff
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c157
1 files changed, 130 insertions, 27 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac9b8cc..459b94c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,25 +165,109 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
/*
* max perf event sample rate
*/
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
- DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+ ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ atomic_set(&perf_sample_allowed_ns, tmp);
+}
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
return 0;
}
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (atomic_read(&perf_sample_allowed_ns) == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __get_cpu_var(running_sample_length) = local_samples_len;
+
+ /*
+ * note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ printk_ratelimited(KERN_WARNING
+ "perf samples too long (%lld > %d), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len,
+ atomic_read(&perf_sample_allowed_ns),
+ sysctl_perf_event_sample_rate);
+
+ update_perf_cpu_limits();
+}
+
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
@@ -1237,6 +1321,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1245,12 +1334,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1275,10 +1367,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1287,12 +1383,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1309,6 +1405,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -3015,10 +3113,7 @@ int perf_event_release_kernel(struct perf_event *event)
* to trigger the AB-BA case.
*/
mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
+ perf_remove_from_context(event, true);
mutex_unlock(&ctx->mutex);
free_event(event);
@@ -5044,6 +5139,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5290,8 +5388,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -6581,6 +6685,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -6727,7 +6834,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
/*
* Removing from the context ends up with disabled
@@ -6737,7 +6844,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
perf_event__state_init(sibling);
put_ctx(gctx);
}
@@ -6867,7 +6974,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock(&src_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event);
+ perf_remove_from_context(event, false);
put_ctx(src_ctx);
list_add(&event->event_entry, &events);
}
@@ -6927,13 +7034,7 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -7395,6 +7496,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -7417,14 +7519,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event;
perf_pmu_rotate_stop(ctx->pmu);
rcu_read_lock();
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event);
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
rcu_read_unlock();
}
@@ -7452,6 +7554,7 @@ static void perf_event_exit_cpu(int cpu)
perf_event_exit_cpu_context(cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}