aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c17
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup.c48
-rw-r--r--kernel/cpu.c26
-rw-r--r--kernel/cpuset.c67
-rw-r--r--kernel/cred-internals.h21
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/hw_breakpoint.c196
-rw-r--r--kernel/kexec.c6
-rw-r--r--kernel/kprobes.c132
-rw-r--r--kernel/lockdep.c88
-rw-r--r--kernel/lockdep_internals.h72
-rw-r--r--kernel/lockdep_proc.c58
-rw-r--r--kernel/module.c14
-rw-r--r--kernel/perf_event.c377
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c12
-rw-r--r--kernel/rcupdate.c19
-rw-r--r--kernel/rcutiny.c35
-rw-r--r--kernel/rcutiny_plugin.h39
-rw-r--r--kernel/rcutorture.c4
-rw-r--r--kernel/rcutree.c131
-rw-r--r--kernel/rcutree.h2
-rw-r--r--kernel/rcutree_plugin.h69
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/sched.c771
-rw-r--r--kernel/sched_debug.c108
-rw-r--r--kernel/sched_fair.c350
-rw-r--r--kernel/sched_features.h55
-rw-r--r--kernel/sched_idletask.c8
-rw-r--r--kernel/sched_rt.c15
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/stop_machine.c537
-rw-r--r--kernel/time/tick-sched.c84
-rw-r--r--kernel/time/timer_list.c1
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c3
-rw-r--r--kernel/trace/trace.h20
-rw-r--r--kernel/trace/trace_entries.h12
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_hw_branches.c312
-rw-r--r--kernel/trace/trace_kprobe.c535
-rw-r--r--kernel/trace/trace_ksym.c26
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c5
-rw-r--r--kernel/trace/trace_selftest.c57
-rw-r--r--kernel/user.c11
51 files changed, 2236 insertions, 2147 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b..149e18ef1ab 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48..e4c0e1fee9b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct;
+ struct bsd_acct_struct *acct = ns->bacct;
- spin_lock(&acct_lock);
- acct = ns->bacct;
- if (acct != NULL) {
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
+ if (acct == NULL)
+ return;
- kfree(acct);
- }
+ del_timer_sync(&acct->timer);
+ spin_lock(&acct_lock);
+ if (acct->file != NULL)
+ acct_file_reopen(acct, NULL, NULL);
spin_unlock(&acct_lock);
+
+ kfree(acct);
}
/*
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b27..2f05303715a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <asm/uaccess.h>
-#include "cred-internals.h"
/*
* Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a53c771e50..e9ec642932e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3016,7 +3016,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- remove_wait_queue_locked(event->wqh, &event->wait);
+ __remove_wait_queue(event->wqh, &event->wait);
spin_lock(&cgrp->event_list_lock);
list_del(&event->list);
spin_unlock(&cgrp->event_list_lock);
@@ -4435,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable);
*/
unsigned short css_id(struct cgroup_subsys_state *css)
{
- struct css_id *cssid = rcu_dereference(css->id);
+ struct css_id *cssid;
+
+ /*
+ * This css_id() can return correct value when somone has refcnt
+ * on this or this is under rcu_read_lock(). Once css->id is allocated,
+ * it's unchanged until freed.
+ */
+ cssid = rcu_dereference_check(css->id,
+ rcu_read_lock_held() || atomic_read(&css->refcnt));
if (cssid)
return cssid->id;
@@ -4445,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id);
unsigned short css_depth(struct cgroup_subsys_state *css)
{
- struct css_id *cssid = rcu_dereference(css->id);
+ struct css_id *cssid;
+
+ cssid = rcu_dereference_check(css->id,
+ rcu_read_lock_held() || atomic_read(&css->refcnt));
if (cssid)
return cssid->depth;
@@ -4453,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
}
EXPORT_SYMBOL_GPL(css_depth);
+/**
+ * css_is_ancestor - test "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supporsed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * But, considering usual usage, the csses should be valid objects after test.
+ * Assuming that the caller will do some action to the child if this returns
+ * returns true, the caller must take "child";s reference count.
+ * If "child" is valid object and this returns true, "root" is valid, too.
+ */
+
bool css_is_ancestor(struct cgroup_subsys_state *child,
const struct cgroup_subsys_state *root)
{
- struct css_id *child_id = rcu_dereference(child->id);
- struct css_id *root_id = rcu_dereference(root->id);
+ struct css_id *child_id;
+ struct css_id *root_id;
+ bool ret = true;
- if (!child_id || !root_id || (child_id->depth < root_id->depth))
- return false;
- return child_id->stack[root_id->depth] == root_id->id;
+ rcu_read_lock();
+ child_id = rcu_dereference(child->id);
+ root_id = rcu_dereference(root->id);
+ if (!child_id
+ || !root_id
+ || (child_id->depth < root_id->depth)
+ || (child_id->stack[root_id->depth] != root_id->id))
+ ret = false;
+ rcu_read_unlock();
+ return ret;
}
static void __free_css_id_cb(struct rcu_head *head)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be..54577757477 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
}
struct take_cpu_down_param {
+ struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
+ unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
+ if (task_cpu(param->caller) == cpu)
+ move_task_off_dead_cpu(cpu, param->caller);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
- cpumask_var_t old_allowed;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
+ .caller = current,
.mod = mod,
.hcpu = hcpu,
};
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
- return -ENOMEM;
-
cpu_hotplug_begin();
set_cpu_active(cpu, false);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
goto out_release;
}
- /* Ensure that we are not runnable on dying cpu */
- cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpu_active_mask);
-
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
hcpu) == NOTIFY_BAD)
BUG();
- goto out_allowed;
+ goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
check_for_tasks(cpu);
-out_allowed:
- set_cpus_allowed_ptr(current, old_allowed);
out_release:
cpu_hotplug_done();
if (!err) {
@@ -264,7 +259,6 @@ out_release:
hcpu) == NOTIFY_BAD)
BUG();
}
- free_cpumask_var(old_allowed);
return err;
}
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
{
int err;
- err = stop_machine_create();
- if (err)
- return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
out:
cpu_maps_update_done();
- stop_machine_destroy();
return err;
}
EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error;
- error = stop_machine_create();
- if (error)
- return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
/*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
- stop_machine_destroy();
return error;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec..9a50c5f6e72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
mutex_lock(&callback_mutex);
- cpuset_cpus_allowed_locked(tsk, pmask);
+ task_lock(tsk);
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ task_unlock(tsk);
mutex_unlock(&callback_mutex);
}
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ const struct cpuset *cs;
+ int cpu;
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+ if (cs)
+ cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ */
+
+ cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Either tsk->cpus_allowed is wrong (see above) or it
+ * is actually empty. The latter case is only possible
+ * if we are racing with remove_tasks_in_empty_cpuset().
+ * Like above we can temporary set any mask and rely on
+ * set_cpus_allowed_ptr() as synchronization point.
+ */
+ cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ cpu = cpumask_any(cpu_active_mask);
+ }
+
+ return cpu;
}
void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
* cpuset_unlock - release lock on cpuset changes
*
* Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf..00000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
- sched_move_task(p);
-#endif /* CONFIG_USER_SCHED */
-}
-
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c23..8f3672a58a1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/cn_proc.h>
-#include "cred-internals.h"
#if 0
#define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
- sched_switch_user(task);
-
/* send notifications */
if (new->uid != old->uid ||
new->euid != old->euid ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac..eabca5a73a8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-#include "cred-internals.h"
static void exit_mm(struct task_struct * tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791b0a2..4d57d9e3a6e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,10 +1112,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.memcg = NULL;
#endif
- p->bts = NULL;
-
- p->stack_start = stack_start;
-
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a..7a56b22e060 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
+
/*
* Constraints data
*/
/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
+
+static int nr_slots[TYPE_MAX];
+
+static int constraints_initialized;
/* Gather the number of total pinned and un-pinned bp in a cpuset */
struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
/* Serialize accesses to the above constraints */
static DEFINE_MUTEX(nr_bp_mutex);
+__weak int hw_breakpoint_weight(struct perf_event *bp)
+{
+ return 1;
+}
+
+static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+{
+ if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+ return TYPE_DATA;
+
+ return TYPE_INST;
+}
+
/*
* Report the maximum number of pinned breakpoints a task
* have in this cpu
*/
-static unsigned int max_task_bp_pinned(int cpu)
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
int i;
- unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+ unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
- for (i = HBP_NUM -1; i >= 0; i--) {
+ for (i = nr_slots[type] - 1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
return i + 1;
}
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
return 0;
}
-static int task_bp_pinned(struct task_struct *tsk)
+static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
{
struct perf_event_context *ctx = tsk->perf_event_ctxp;
struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
*/
list_for_each_entry(bp, list, event_entry) {
if (bp->attr.type == PERF_TYPE_BREAKPOINT)
- count++;
+ if (find_slot_idx(bp) == type)
+ count += hw_breakpoint_weight(bp);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
* a given cpu (cpu > -1) or in all of them (cpu = -1).
*/
static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
+ enum bp_type_idx type)
{
int cpu = bp->cpu;
struct task_struct *tsk = bp->ctx->task;
if (cpu >= 0) {
- slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+ slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
if (!tsk)
- slots->pinned += max_task_bp_pinned(cpu);
+ slots->pinned += max_task_bp_pinned(cpu, type);
else
- slots->pinned += task_bp_pinned(tsk);
- slots->flexible = per_cpu(nr_bp_flexible, cpu);
+ slots->pinned += task_bp_pinned(tsk, type);
+ slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
return;
}
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
for_each_online_cpu(cpu) {
unsigned int nr;
- nr = per_cpu(nr_cpu_bp_pinned, cpu);
+ nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
if (!tsk)
- nr += max_task_bp_pinned(cpu);
+ nr += max_task_bp_pinned(cpu, type);
else
- nr += task_bp_pinned(tsk);
+ nr += task_bp_pinned(tsk, type);
if (nr > slots->pinned)
slots->pinned = nr;
- nr = per_cpu(nr_bp_flexible, cpu);
+ nr = per_cpu(nr_bp_flexible[type], cpu);
if (nr > slots->flexible)
slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
}
/*
+ * For now, continue to consider flexible as pinned, until we can
+ * ensure no flexible event can ever be scheduled before a pinned event
+ * in a same cpu.
+ */
+static void
+fetch_this_slot(struct bp_busy_slots *slots, int weight)
+{
+ slots->pinned += weight;
+}
+
+/*
* Add a pinned breakpoint for the given task in our constraint table
*/
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+ enum bp_type_idx type, int weight)
{
unsigned int *tsk_pinned;
- int count = 0;
+ int old_count = 0;
+ int old_idx = 0;
+ int idx = 0;
- count = task_bp_pinned(tsk);
+ old_count = task_bp_pinned(tsk, type);
+ old_idx = old_count - 1;
+ idx = old_idx + weight;
- tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
if (enable) {
- tsk_pinned[count]++;
- if (count > 0)
- tsk_pinned[count-1]--;
+ tsk_pinned[idx]++;
+ if (old_count > 0)
+ tsk_pinned[old_idx]--;
} else {
- tsk_pinned[count]--;
- if (count > 0)
- tsk_pinned[count-1]++;
+ tsk_pinned[idx]--;
+ if (old_count > 0)
+ tsk_pinned[old_idx]++;
}
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void toggle_bp_slot(struct perf_event *bp, bool enable)
+static void
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
+ int weight)
{
int cpu = bp->cpu;
struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
/* Pinned counter task profiling */
if (tsk) {
if (cpu >= 0) {
- toggle_bp_task_slot(tsk, cpu, enable);
+ toggle_bp_task_slot(tsk, cpu, enable, type, weight);
return;
}
for_each_online_cpu(cpu)
- toggle_bp_task_slot(tsk, cpu, enable);
+ toggle_bp_task_slot(tsk, cpu, enable, type, weight);
return;
}
/* Pinned counter cpu profiling */
if (enable)
- per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+ per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
else
- per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+ per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
}
/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
static int __reserve_bp_slot(struct perf_event *bp)
{
struct bp_busy_slots slots = {0};
+ enum bp_type_idx type;
+ int weight;
- fetch_bp_busy_slots(&slots, bp);
+ /* We couldn't initialize breakpoint constraints on boot */
+ if (!constraints_initialized)
+ return -ENOMEM;
+
+ /* Basic checks */
+ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
+ bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ return -EINVAL;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+
+ fetch_bp_busy_slots(&slots, bp, type);
+ fetch_this_slot(&slots, weight);
/* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) == HBP_NUM)
+ if (slots.pinned + (!!slots.flexible) > nr_slots[type])
return -ENOSPC;
- toggle_bp_slot(bp, true);
+ toggle_bp_slot(bp, true, type, weight);
return 0;
}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
static void __release_bp_slot(struct perf_event *bp)
{
- toggle_bp_slot(bp, false);
+ enum bp_type_idx type;
+ int weight;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+ toggle_bp_slot(bp, false, type, weight);
}
void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
return 0;
}
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = arch_validate_hwbkpt_settings(bp);
+ if (ret)
+ return ret;
+
+ if (arch_check_bp_in_kernelspace(bp)) {
+ if (bp->attr.exclude_kernel)
+ return -EINVAL;
+ /*
+ * Don't let unprivileged users set a breakpoint in the trap
+ * path to avoid trap recursion attacks.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
int register_perf_hw_breakpoint(struct perf_event *bp)
{
int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
if (ret)
return ret;
- /*
- * Ptrace breakpoints can be temporary perf events only
- * meant to reserve a slot. In this case, it is created disabled and
- * we don't want to check the params right now (as we put a null addr)
- * But perf tools create events as disabled and we want to check
- * the params for them.
- * This is a quick hack that will be removed soon, once we remove
- * the tmp breakpoints from ptrace
- */
- if (!bp->attr.disabled || !bp->overflow_handler)
- ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+ ret = validate_hw_breakpoint(bp);
/* if arch_validate_hwbkpt_settings() fails then release bp slot */
if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
if (attr->disabled)
goto end;
- err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+ err = validate_hw_breakpoint(bp);
if (!err)
perf_event_enable(bp);
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
static int __init init_hw_breakpoint(void)
{
+ unsigned int **task_bp_pinned;
+ int cpu, err_cpu;
+ int i;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ nr_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
+ *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
+ GFP_KERNEL);
+ if (!*task_bp_pinned)
+ goto err_alloc;
+ }
+ }
+
+ constraints_initialized = 1;
+
return register_die_notifier(&hw_breakpoint_exceptions_nb);
+
+ err_alloc:
+ for_each_possible_cpu(err_cpu) {
+ if (err_cpu == cpu)
+ break;
+ for (i = 0; i < TYPE_MAX; i++)
+ kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ }
+
+ return -ENOMEM;
}
core_initcall(init_hw_breakpoint);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc47..474a84715ea 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
free_reserved_phys_range(end, crashk_res.end);
- if (start == end) {
- crashk_res.end = end;
+ if (start == end)
release_resource(&crashk_res);
- } else
- crashk_res.end = end - 1;
+ crashk_res.end = end - 1;
unlock:
mutex_unlock(&kexec_mutex);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e..282035f3ae9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
arch_remove_kprobe(p);
}
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If the probe is already disabled (or gone), just return */
+ if (kprobe_disabled(kp))
+ goto out;
+
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ /* When kp != p, p is always enabled. */
+ try_to_disable_aggr_kprobe(p);
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p))
+ disarm_kprobe(p);
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (kprobe_gone(kp)) {
+ /* This kprobe has gone, we couldn't enable it. */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (p != kp)
+ kp->flags &= ~KPROBE_FLAG_DISABLED;
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p)) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ arm_kprobe(p);
+ }
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
void __kprobes dump_kprobe(struct kprobe *kp)
{
printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-/* Disable one kprobe */
-int __kprobes disable_kprobe(struct kprobe *kp)
-{
- int ret = 0;
- struct kprobe *p;
-
- mutex_lock(&kprobe_mutex);
-
- /* Check whether specified probe is valid. */
- p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
- ret = -EINVAL;
- goto out;
- }
-
- /* If the probe is already disabled (or gone), just return */
- if (kprobe_disabled(kp))
- goto out;
-
- kp->flags |= KPROBE_FLAG_DISABLED;
- if (p != kp)
- /* When kp != p, p is always enabled. */
- try_to_disable_aggr_kprobe(p);
-
- if (!kprobes_all_disarmed && kprobe_disabled(p))
- disarm_kprobe(p);
-out:
- mutex_unlock(&kprobe_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(disable_kprobe);
-
-/* Enable one kprobe */
-int __kprobes enable_kprobe(struct kprobe *kp)
-{
- int ret = 0;
- struct kprobe *p;
-
- mutex_lock(&kprobe_mutex);
-
- /* Check whether specified probe is valid. */
- p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
- ret = -EINVAL;
- goto out;
- }
-
- if (kprobe_gone(kp)) {
- /* This kprobe has gone, we couldn't enable it. */
- ret = -EINVAL;
- goto out;
- }
-
- if (p != kp)
- kp->flags &= ~KPROBE_FLAG_DISABLED;
-
- if (!kprobes_all_disarmed && kprobe_disabled(p)) {
- p->flags &= ~KPROBE_FLAG_DISABLED;
- arm_kprobe(p);
- }
-out:
- mutex_unlock(&kprobe_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(enable_kprobe);
-
static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41c..ec21304856d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
/*
* Various lockdep statistics:
*/
-atomic_t chain_lookup_hits;
-atomic_t chain_lookup_misses;
-atomic_t hardirqs_on_events;
-atomic_t hardirqs_off_events;
-atomic_t redundant_hardirqs_on;
-atomic_t redundant_hardirqs_off;
-atomic_t softirqs_on_events;
-atomic_t softirqs_off_events;
-atomic_t redundant_softirqs_on;
-atomic_t redundant_softirqs_off;
-atomic_t nr_unused_locks;
-atomic_t nr_cyclic_checks;
-atomic_t nr_find_usage_forwards_checks;
-atomic_t nr_find_usage_backwards_checks;
+DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
return NULL;
}
class = lock_classes + nr_lock_classes++;
- debug_atomic_inc(&nr_unused_locks);
+ debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
* Add a new dependency to the head of the list:
*/
static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
- struct list_head *head, unsigned long ip, int distance)
+ struct list_head *head, unsigned long ip,
+ int distance, struct stack_trace *trace)
{
struct lock_list *entry;
/*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
if (!entry)
return 0;
- if (!save_trace(&entry->trace))
- return 0;
-
entry->class = this;
entry->distance = distance;
+ entry->trace = *trace;
/*
* Since we never remove from the dependency list, the list can
* be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
{
int result;
- debug_atomic_inc(&nr_cyclic_checks);
+ debug_atomic_inc(nr_cyclic_checks);
result = __bfs_forwards(root, target, class_equal, target_entry);
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
{
int result;
- debug_atomic_inc(&nr_find_usage_forwards_checks);
+ debug_atomic_inc(nr_find_usage_forwards_checks);
result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
{
int result;
- debug_atomic_inc(&nr_find_usage_backwards_checks);
+ debug_atomic_inc(nr_find_usage_backwards_checks);
result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance)
+ struct held_lock *next, int distance, int trylock_loop)
{
struct lock_list *entry;
int ret;
struct lock_list this;
struct lock_list *uninitialized_var(target_entry);
+ /*
+ * Static variable, serialized by the graph_lock().
+ *
+ * We use this static variable to save the stack trace in case
+ * we call into this function multiple times due to encountering
+ * trylocks in the held lock stack.
+ */
+ static struct stack_trace trace;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
+ if (!trylock_loop && !save_trace(&trace))
+ return 0;
+
/*
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance);
+ next->acquire_ip, distance, &trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance);
+ next->acquire_ip, distance, &trace);
if (!ret)
return 0;
@@ -1731,6 +1728,7 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
+ int trylock_loop = 0;
struct held_lock *hlock;
/*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
* added:
*/
if (hlock->read != 2) {
- if (!check_prev_add(curr, hlock, next, distance))
+ if (!check_prev_add(curr, hlock, next,
+ distance, trylock_loop))
return 0;
/*
* Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
+ trylock_loop = 1;
}
return 1;
out_bug:
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
list_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
- debug_atomic_inc(&chain_lookup_hits);
+ debug_atomic_inc(chain_lookup_hits);
if (very_verbose(class))
printk("\nhash chain already cached, key: "
"%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
chain_hlocks[chain->base + j] = class - lock_classes;
}
list_add_tail_rcu(&chain->entry, hash_head);
- debug_atomic_inc(&chain_lookup_misses);
+ debug_atomic_inc(chain_lookup_misses);
inc_chains();
return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
return;
if (unlikely(curr->hardirqs_enabled)) {
- debug_atomic_inc(&redundant_hardirqs_on);
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but loosing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
return;
}
/* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
curr->hardirq_enable_ip = ip;
curr->hardirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(&hardirqs_on_events);
+ debug_atomic_inc(hardirqs_on_events);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
curr->hardirqs_enabled = 0;
curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
- debug_atomic_inc(&hardirqs_off_events);
+ debug_atomic_inc(hardirqs_off_events);
} else
- debug_atomic_inc(&redundant_hardirqs_off);
+ debug_atomic_inc(redundant_hardirqs_off);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
return;
if (curr->softirqs_enabled) {
- debug_atomic_inc(&redundant_softirqs_on);
+ debug_atomic_inc(redundant_softirqs_on);
return;
}
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
curr->softirqs_enabled = 1;
curr->softirq_enable_ip = ip;
curr->softirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(&softirqs_on_events);
+ debug_atomic_inc(softirqs_on_events);
/*
* We are going to turn softirqs on, so set the
* usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
curr->softirqs_enabled = 0;
curr->softirq_disable_ip = ip;
curr->softirq_disable_event = ++curr->irq_events;
- debug_atomic_inc(&softirqs_off_events);
+ debug_atomic_inc(softirqs_off_events);
DEBUG_LOCKS_WARN_ON(!softirq_count());
} else
- debug_atomic_inc(&redundant_softirqs_off);
+ debug_atomic_inc(redundant_softirqs_off);
}
static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
break;
case LOCK_USED:
- debug_atomic_dec(&nr_unused_locks);
+ debug_atomic_dec(nr_unused_locks);
break;
default:
if (!debug_locks_off_graph_unlock())
@@ -2750,7 +2755,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!class)
return 0;
}
- debug_atomic_inc((atomic_t *)&class->ops);
+ atomic_inc((atomic_t *)&class->ops);
if (very_verbose(class)) {
printk("\nacquire class [%p] %s", class->key, class->name);
if (class->name_version > 1)
@@ -3227,7 +3232,7 @@ void lock_release(struct lockdep_map *lock, int nested,
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
- trace_lock_release(lock, nested, ip);
+ trace_lock_release(lock, ip);
__lock_release(lock, nested, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -3380,7 +3385,7 @@ found_it:
hlock->holdtime_stamp = now;
}
- trace_lock_acquired(lock, ip, waittime);
+ trace_lock_acquired(lock, ip);
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
@@ -3801,8 +3806,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
{
struct task_struct *curr = current;
+#ifndef CONFIG_PROVE_RCU_REPEATEDLY
if (!debug_locks_off())
return;
+#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
+ /* Note: the following can be executed concurrently, so be careful. */
printk("\n===================================================\n");
printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad131..4f560cfedc8 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
#endif
#ifdef CONFIG_DEBUG_LOCKDEP
+
+#include <asm/local.h>
/*
- * Various lockdep statistics:
+ * Various lockdep statistics.
+ * We want them per cpu as they are often accessed in fast path
+ * and we want to avoid too much cache bouncing.
*/
-extern atomic_t chain_lookup_hits;
-extern atomic_t chain_lookup_misses;
-extern atomic_t hardirqs_on_events;
-extern atomic_t hardirqs_off_events;
-extern atomic_t redundant_hardirqs_on;
-extern atomic_t redundant_hardirqs_off;
-extern atomic_t softirqs_on_events;
-extern atomic_t softirqs_off_events;
-extern atomic_t redundant_softirqs_on;
-extern atomic_t redundant_softirqs_off;
-extern atomic_t nr_unused_locks;
-extern atomic_t nr_cyclic_checks;
-extern atomic_t nr_cyclic_check_recursions;
-extern atomic_t nr_find_usage_forwards_checks;
-extern atomic_t nr_find_usage_forwards_recursions;
-extern atomic_t nr_find_usage_backwards_checks;
-extern atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr) atomic_inc(ptr)
-# define debug_atomic_dec(ptr) atomic_dec(ptr)
-# define debug_atomic_read(ptr) atomic_read(ptr)
+struct lockdep_stats {
+ int chain_lookup_hits;
+ int chain_lookup_misses;
+ int hardirqs_on_events;
+ int hardirqs_off_events;
+ int redundant_hardirqs_on;
+ int redundant_hardirqs_off;
+ int softirqs_on_events;
+ int softirqs_off_events;
+ int redundant_softirqs_on;
+ int redundant_softirqs_off;
+ int nr_unused_locks;
+ int nr_cyclic_checks;
+ int nr_cyclic_check_recursions;
+ int nr_find_usage_forwards_checks;
+ int nr_find_usage_forwards_recursions;
+ int nr_find_usage_backwards_checks;
+ int nr_find_usage_backwards_recursions;
+};
+
+DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
+
+#define __debug_atomic_inc(ptr) \
+ this_cpu_inc(lockdep_stats.ptr);
+
+#define debug_atomic_inc(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_inc(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_dec(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_dec(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_read(ptr) ({ \
+ struct lockdep_stats *__cpu_lockdep_stats; \
+ unsigned long long __total = 0; \
+ int __cpu; \
+ for_each_possible_cpu(__cpu) { \
+ __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
+ __total += __cpu_lockdep_stats->ptr; \
+ } \
+ __total; \
+})
#else
+# define __debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584..59b76c8ce9d 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
static void lockdep_stats_debug_show(struct seq_file *m)
{
#ifdef CONFIG_DEBUG_LOCKDEP
- unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
- hi2 = debug_atomic_read(&hardirqs_off_events),
- hr1 = debug_atomic_read(&redundant_hardirqs_on),
- hr2 = debug_atomic_read(&redundant_hardirqs_off),
- si1 = debug_atomic_read(&softirqs_on_events),
- si2 = debug_atomic_read(&softirqs_off_events),
- sr1 = debug_atomic_read(&redundant_softirqs_on),
- sr2 = debug_atomic_read(&redundant_softirqs_off);
-
- seq_printf(m, " chain lookup misses: %11u\n",
- debug_atomic_read(&chain_lookup_misses));
- seq_printf(m, " chain lookup hits: %11u\n",
- debug_atomic_read(&chain_lookup_hits));
- seq_printf(m, " cyclic checks: %11u\n",
- debug_atomic_read(&nr_cyclic_checks));
- seq_printf(m, " find-mask forwards checks: %11u\n",
- debug_atomic_read(&nr_find_usage_forwards_checks));
- seq_printf(m, " find-mask backwards checks: %11u\n",
- debug_atomic_read(&nr_find_usage_backwards_checks));
-
- seq_printf(m, " hardirq on events: %11u\n", hi1);
- seq_printf(m, " hardirq off events: %11u\n", hi2);
- seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
- seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
- seq_printf(m, " softirq on events: %11u\n", si1);
- seq_printf(m, " softirq off events: %11u\n", si2);
- seq_printf(m, " redundant softirq ons: %11u\n", sr1);
- seq_printf(m, " redundant softirq offs: %11u\n", sr2);
+ unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
+ hi2 = debug_atomic_read(hardirqs_off_events),
+ hr1 = debug_atomic_read(redundant_hardirqs_on),
+ hr2 = debug_atomic_read(redundant_hardirqs_off),
+ si1 = debug_atomic_read(softirqs_on_events),
+ si2 = debug_atomic_read(softirqs_off_events),
+ sr1 = debug_atomic_read(redundant_softirqs_on),
+ sr2 = debug_atomic_read(redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11llu\n",
+ debug_atomic_read(chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11llu\n",
+ debug_atomic_read(chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11llu\n",
+ debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " find-mask forwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask backwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_backwards_checks));
+
+ seq_printf(m, " hardirq on events: %11llu\n", hi1);
+ seq_printf(m, " hardirq off events: %11llu\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
+ seq_printf(m, " softirq on events: %11llu\n", si1);
+ seq_printf(m, " softirq off events: %11llu\n", si2);
+ seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
#endif
}
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
- DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index b8a1e313448..e2564580f3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -724,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- /* Create stop_machine threads since free_module relies on
- * a non-failing stop_machine call. */
- ret = stop_machine_create();
- if (ret)
- return ret;
-
- if (mutex_lock_interruptible(&module_mutex) != 0) {
- ret = -EINTR;
- goto out_stop;
- }
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
mod = find_module(name);
if (!mod) {
@@ -793,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
out:
mutex_unlock(&module_mutex);
-out_stop:
- stop_machine_destroy();
return ret;
}
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d1552d3c12..a4fa381db3c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
+#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
void __weak hw_perf_disable(void) { barrier(); }
void __weak hw_perf_enable(void) { barrier(); }
-int __weak
-hw_perf_group_sched_in(struct perf_event *group_leader,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- return 0;
-}
-
void __weak perf_event_print_debug(void) { }
static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
event->total_time_running = run_end - event->tstamp_running;
}
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ update_event_times(leader);
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ update_event_times(event);
+}
+
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *sibling, *tmp;
-
if (list_empty(&event->group_entry))
return;
ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader != event)
event->group_leader->nr_siblings--;
- update_event_times(event);
+ update_group_times(event);
/*
* If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+}
+
+static void
+perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event *sibling, *tmp;
/*
* If this was a group event with sibling events then
@@ -505,18 +514,6 @@ retry:
}
/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
-}
-
-/*
* Cross CPU call to disable a performance event
*/
static void __perf_event_disable(void *info)
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- struct perf_event *event, *partial_group;
+ struct perf_event *event, *partial_group = NULL;
+ const struct pmu *pmu = group_event->pmu;
+ bool txn = false;
int ret;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
- ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
- if (ret)
- return ret < 0 ? ret : 0;
+ /* Check if group transaction availabe */
+ if (pmu->start_txn)
+ txn = true;
+
+ if (txn)
+ pmu->start_txn(pmu);
if (event_sched_in(group_event, cpuctx, ctx))
return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
}
}
- return 0;
+ if (!txn)
+ return 0;
+
+ ret = pmu->commit_txn(pmu);
+ if (!ret) {
+ pmu->cancel_txn(pmu);
+ return 0;
+ }
group_error:
+ if (txn)
+ pmu->cancel_txn(pmu);
+
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx)
return;
+ perf_disable();
+
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
cpuctx->task_ctx = ctx;
+
+ perf_enable();
}
#define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ /*
+ * Remove from the PMU, can't get re-enabled since we got
+ * here because the last ref went.
+ */
+ perf_event_disable(event);
+
WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
- perf_event_remove_from_context(event);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_irq(&ctx->lock);
+ list_del_event(event, ctx);
+ perf_destroy_group(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
mutex_lock(&event->owner->perf_event_mutex);
@@ -2642,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
}
static const struct file_operations perf_fops = {
+ .llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
@@ -2792,6 +2830,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = cbs;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+/*
* Output
*/
static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
@@ -3743,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
.event_id = {
.header = {
.type = PERF_RECORD_MMAP,
- .misc = 0,
+ .misc = PERF_RECORD_MISC_USER,
/* .size */
},
/* .pid */
@@ -3961,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, nmi, data, regs);
}
-static int perf_swevent_is_counting(struct perf_event *event)
-{
- /*
- * The event is active, we're good!
- */
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- return 1;
-
- /*
- * The event is off/error, not counting.
- */
- if (event->state != PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- /*
- * The event is inactive, if the context is active
- * we're part of a group that didn't make it on the 'pmu',
- * not counting.
- */
- if (event->ctx->is_active)
- return 0;
-
- /*
- * We're inactive and the context is too, this means the
- * task is scheduled out, we're counting events that happen
- * to us, like migration events.
- */
- return 1;
-}
-
static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data);
@@ -4014,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- if (event->cpu != -1 && event->cpu != smp_processor_id())
- return 0;
-
- if (!perf_swevent_is_counting(event))
- return 0;
-
if (event->attr.type != type)
return 0;
@@ -4036,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event,
return 1;
}
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
- enum perf_type_id type,
- u32 event_id, u64 nr, int nmi,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
{
+ u64 val = event_id | (type << 32);
+
+ return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+ u64 hash;
+ struct swevent_hlist *hlist;
+
+ hash = swevent_hash(type, event_id);
+
+ hlist = rcu_dereference(ctx->swevent_hlist);
+ if (!hlist)
+ return NULL;
+
+ return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+ u64 nr, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_cpu_context *cpuctx;
struct perf_event *event;
+ struct hlist_node *node;
+ struct hlist_head *head;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ cpuctx = &__get_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+
+ head = find_swevent_head(cpuctx, type, event_id);
+
+ if (!head)
+ goto end;
+
+ hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_add(event, nr, nmi, data, regs);
}
+end:
+ rcu_read_unlock();
}
int perf_swevent_get_recursion_context(void)
@@ -4085,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx)
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
- u64 nr, int nmi,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
-
- cpuctx = &__get_cpu_var(perf_cpu_context);
- rcu_read_lock();
- perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
- nr, nmi, data, regs);
- /*
- * doesn't really matter which of the child contexts the
- * events ends up in.
- */
- ctx = rcu_dereference(current->perf_event_ctxp);
- if (ctx)
- perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
- rcu_read_unlock();
-}
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr)
@@ -4131,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event)
static int perf_swevent_enable(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
+ struct perf_cpu_context *cpuctx;
+ struct hlist_head *head;
+
+ cpuctx = &__get_cpu_var(perf_cpu_context);
if (hwc->sample_period) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
+
+ head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+ if (WARN_ON_ONCE(!head))
+ return -EINVAL;
+
+ hlist_add_head_rcu(&event->hlist_entry, head);
+
return 0;
}
static void perf_swevent_disable(struct perf_event *event)
{
+ hlist_del_rcu(&event->hlist_entry);
}
static const struct pmu perf_ops_generic = {
@@ -4168,15 +4217,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
perf_sample_data_init(&data, 0);
data.period = event->hw.last_period;
regs = get_irq_regs();
- /*
- * In case we exclude kernel IPs or are somehow not in interrupt
- * context, provide the next best thing, the user IP.
- */
- if ((event->attr.exclude_kernel || !regs) &&
- !event->attr.exclude_user)
- regs = task_pt_regs(current);
- if (regs) {
+ if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && current->pid == 0))
if (perf_event_overflow(event, 0, &data, regs))
ret = HRTIMER_NORESTART;
@@ -4324,6 +4366,105 @@ static const struct pmu perf_ops_task_clock = {
.read = task_clock_perf_event_read,
};
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+ struct swevent_hlist *hlist;
+
+ hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+ kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+ struct swevent_hlist *hlist;
+
+ if (!cpuctx->swevent_hlist)
+ return;
+
+ hlist = cpuctx->swevent_hlist;
+ rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+ call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+ mutex_lock(&cpuctx->hlist_mutex);
+
+ if (!--cpuctx->hlist_refcount)
+ swevent_hlist_release(cpuctx);
+
+ mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+ int cpu;
+
+ if (event->cpu != -1) {
+ swevent_hlist_put_cpu(event, event->cpu);
+ return;
+ }
+
+ for_each_possible_cpu(cpu)
+ swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ int err = 0;
+
+ mutex_lock(&cpuctx->hlist_mutex);
+
+ if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ if (!hlist) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+ }
+ cpuctx->hlist_refcount++;
+ exit:
+ mutex_unlock(&cpuctx->hlist_mutex);
+
+ return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+ int err;
+ int cpu, failed_cpu;
+
+ if (event->cpu != -1)
+ return swevent_hlist_get_cpu(event, event->cpu);
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ err = swevent_hlist_get_cpu(event, cpu);
+ if (err) {
+ failed_cpu = cpu;
+ goto fail;
+ }
+ }
+ put_online_cpus();
+
+ return 0;
+ fail:
+ for_each_possible_cpu(cpu) {
+ if (cpu == failed_cpu)
+ break;
+ swevent_hlist_put_cpu(event, cpu);
+ }
+
+ put_online_cpus();
+ return err;
+}
+
#ifdef CONFIG_EVENT_TRACING
void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
@@ -4357,10 +4498,13 @@ static int perf_tp_event_match(struct perf_event *event,
static void tp_perf_event_destroy(struct perf_event *event)
{
perf_trace_disable(event->attr.config);
+ swevent_hlist_put(event);
}
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
+ int err;
+
/*
* Raw tracepoint data is a severe data leak, only allow root to
* have these.
@@ -4374,6 +4518,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
return NULL;
event->destroy = tp_perf_event_destroy;
+ err = swevent_hlist_get(event);
+ if (err) {
+ perf_trace_disable(event->attr.config);
+ return ERR_PTR(err);
+ }
return &perf_ops_generic;
}
@@ -4474,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
atomic_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
}
static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
case PERF_COUNT_SW_ALIGNMENT_FAULTS:
case PERF_COUNT_SW_EMULATION_FAULTS:
if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return ERR_PTR(err);
+
atomic_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
@@ -5176,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child)
*
* But since its the parent context it won't be the same instance.
*/
- mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock(&child_ctx->mutex);
again:
list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5540,7 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
cpuctx = &per_cpu(perf_cpu_context, cpu);
+ mutex_init(&cpuctx->hlist_mutex);
__perf_event_init_context(&cpuctx->ctx, NULL);
}
}
@@ -5397,6 +5554,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
spin_lock(&perf_resource_lock);
cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
spin_unlock(&perf_resource_lock);
+
+ mutex_lock(&cpuctx->hlist_mutex);
+ if (cpuctx->hlist_refcount > 0) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ WARN_ON_ONCE(!hlist);
+ rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+ }
+ mutex_unlock(&cpuctx->hlist_mutex);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu)
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_event_context *ctx = &cpuctx->ctx;
+ mutex_lock(&cpuctx->hlist_mutex);
+ swevent_hlist_release(cpuctx);
+ mutex_unlock(&cpuctx->hlist_mutex);
+
mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae..dfadc5b729f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
return 0;
prof_buffer = vmalloc(buffer_bytes);
- if (prof_buffer)
+ if (prof_buffer) {
+ memset(prof_buffer, 0, buffer_bytes);
return 0;
+ }
free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a..6af9cdd558b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
- arch_ptrace_untrace(child);
if (task_is_traced(child))
ptrace_untrace(child);
}
@@ -666,10 +664,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
struct task_struct *child;
long ret;
- /*
- * This lock_kernel fixes a subtle race with suid exec
- */
- lock_kernel();
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
if (!ret)
@@ -703,7 +697,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
out_put_task_struct:
put_task_struct(child);
out:
- unlock_kernel();
return ret;
}
@@ -813,10 +806,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
struct task_struct *child;
long ret;
- /*
- * This lock_kernel fixes a subtle race with suid exec
- */
- lock_kernel();
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
goto out;
@@ -846,7 +835,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
out_put_task_struct:
put_task_struct(child);
out:
- unlock_kernel();
return ret;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 49d808e833b..72a8dc9567f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/module.h>
-#include <linux/kernel_stat.h>
#include <linux/hardirq.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
#endif
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections.
- */
-void rcu_scheduler_starting(void)
-{
- WARN_ON(num_online_cpus() != 1);
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
-}
-
-/*
* Awaken the corresponding synchronize_rcu() instance now that a
* grace period has elapsed.
*/
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572..38729d3cd23 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
};
/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
- .donetail = &rcu_ctrlblk.rcucblist,
- .curtail = &rcu_ctrlblk.rcucblist,
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
#ifdef CONFIG_NO_HZ
static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
*/
void rcu_sched_qs(int cpu)
{
- if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
+ if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
+ rcu_qsctr_help(&rcu_bh_ctrlblk))
raise_softirq(RCU_SOFTIRQ);
}
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
*/
static void rcu_process_callbacks(struct softirq_action *unused)
{
- __rcu_process_callbacks(&rcu_ctrlblk);
+ __rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
}
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*
* Cool, huh? (Due to Josh Triplett.)
*
- * But we want to make this a static inline later.
+ * But we want to make this a static inline later. The cond_resched()
+ * currently makes this problematic.
*/
void synchronize_sched(void)
{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
}
EXPORT_SYMBOL_GPL(synchronize_sched);
-void synchronize_rcu_bh(void)
-{
- synchronize_sched();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
-
/*
* Helper function for call_rcu() and call_rcu_bh().
*/
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
- __call_rcu(head, func, &rcu_ctrlblk);
+ __call_rcu(head, func, &rcu_sched_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -244,11 +245,13 @@ void rcu_barrier(void)
{
struct rcu_synchronize rcu;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
{
struct rcu_synchronize rcu;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_bh(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
{
struct rcu_synchronize rcu;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_sched(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
@@ -280,3 +287,5 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
+
+#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 00000000000..d223a92bc74
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptable semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+#include <linux/kernel_stat.h>
+
+/*
+ * During boot, we forgive RCU lockdep issues. After this function is
+ * invoked, we start taking RCU lockdep issues seriously.
+ */
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83e..6535ac8bc6a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
{
struct rcu_bh_torture_synchronize rcu;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
.sync = synchronize_sched_expedited,
.cb_barrier = NULL,
.fqs = rcu_sched_force_quiescent_state,
- .stats = rcu_expedited_torture_stats,
+ .stats = NULL,
.irq_capable = 1,
.name = "sched_expedited"
};
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75..d4437345706 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
+#include <linux/kernel_stat.h>
#include "rcutree.h"
@@ -53,8 +54,8 @@
static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
-#define RCU_STATE_INITIALIZER(name) { \
- .level = { &name.node[0] }, \
+#define RCU_STATE_INITIALIZER(structname) { \
+ .level = { &structname.node[0] }, \
.levelcnt = { \
NUM_RCU_LVL_0, /* root of hierarchy. */ \
NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.signaled = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
- .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
+ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
.orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &name.orphan_cbs_list, \
+ .orphan_cbs_tail = &structname.orphan_cbs_list, \
.orphan_qlen = 0, \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
+ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
+ .name = #structname, \
}
struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(int cpu)
{
- struct rcu_data *rdp;
+ struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
- rdp = &per_cpu(rcu_sched_data, cpu);
rdp->passed_quiesc_completed = rdp->gpnum - 1;
barrier();
rdp->passed_quiesc = 1;
- rcu_preempt_note_context_switch(cpu);
}
void rcu_bh_qs(int cpu)
{
- struct rcu_data *rdp;
+ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
- rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc_completed = rdp->gpnum - 1;
barrier();
rdp->passed_quiesc = 1;
}
+/*
+ * Note a context switch. This is a quiescent state for RCU-sched,
+ * and requires special handling for preemptible RCU.
+ */
+void rcu_note_context_switch(int cpu)
+{
+ rcu_sched_qs(cpu);
+ rcu_preempt_note_context_switch(cpu);
+}
+
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_panicking __read_mostly;
+
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
/* OK, time to rat on our buddy... */
- printk(KERN_ERR "INFO: RCU detected CPU stalls:");
+ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
+ rsp->name);
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
if (rnp->qsmask & (1UL << cpu))
printk(" %d", rnp->grplo + cpu);
}
- printk(" (detected by %d, t=%ld jiffies)\n",
+ printk("} (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start));
trigger_all_cpu_backtrace();
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
- printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
- smp_processor_id(), jiffies - rsp->gp_start);
+ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
+ rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
trigger_all_cpu_backtrace();
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
long delta;
struct rcu_node *rnp;
+ if (rcu_cpu_stall_panicking)
+ return;
delta = jiffies - rsp->jiffies_stall;
rnp = rdp->mynode;
if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
}
}
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_panicking = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static void __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+}
+
#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
}
+static void __init check_cpu_stall_init(void)
+{
+}
+
#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
*/
void rcu_check_callbacks(int cpu, int user)
{
- if (!rcu_pending(cpu))
- return; /* if nothing for RCU to do. */
if (user ||
(idle_cpu(cpu) && rcu_scheduler_active &&
!in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
rcu_bh_qs(cpu);
}
rcu_preempt_check_callbacks(cpu);
- raise_softirq(RCU_SOFTIRQ);
+ if (rcu_pending(cpu))
+ raise_softirq(RCU_SOFTIRQ);
}
#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
break; /* grace period idle or initializing, ignore. */
case RCU_SAVE_DYNTICK:
-
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
break; /* So gcc recognizes the dead code. */
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+
/* Record dyntick-idle state. */
force_qs_rnp(rsp, dyntick_save_progress_counter);
raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
if (rcu_blocking_is_gp())
return;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_sched(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
if (rcu_blocking_is_gp())
return;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_bh(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
check_cpu_stall(rsp, rdp);
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->qs_pending) {
+ if (rdp->qs_pending && !rdp->passed_quiesc) {
+
+ /*
+ * If force_quiescent_state() coming soon and this CPU
+ * needs a quiescent state, and this is either RCU-sched
+ * or RCU-bh, force a local reschedule.
+ */
rdp->n_rp_qs_pending++;
+ if (!rdp->preemptable &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
+ jiffies))
+ set_need_resched();
+ } else if (rdp->qs_pending && rdp->passed_quiesc) {
+ rdp->n_rp_report_qs++;
return 1;
}
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
}
/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process. Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system). After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections. This function also enables RCU lockdep checking.
+ */
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+/*
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
*/
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
}
}
+
+ rnp = rsp->level[NUM_RCU_LVLS - 1];
+ for_each_possible_cpu(i) {
+ while (i > rnp->grphi)
+ rnp++;
+ rsp->rda[i]->mynode = rnp;
+ rcu_boot_init_percpu_data(i, rsp);
+ }
}
/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
#define RCU_INIT_FLAVOR(rsp, rcu_data) \
do { \
int i; \
- int j; \
- struct rcu_node *rnp; \
\
- rcu_init_one(rsp); \
- rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
- j = 0; \
for_each_possible_cpu(i) { \
- if (i > rnp[j].grphi) \
- j++; \
- per_cpu(rcu_data, i).mynode = &rnp[j]; \
(rsp)->rda[i] = &per_cpu(rcu_data, i); \
- rcu_boot_init_percpu_data(i, rsp); \
} \
+ rcu_init_one(rsp); \
} while (0)
void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
int cpu;
rcu_bootup_announce();
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
- printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#if NUM_RCU_LVL_4 != 0
- printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
-#endif /* #if NUM_RCU_LVL_4 != 0 */
RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
__rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
cpu_notifier(rcu_cpu_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ check_cpu_stall_init();
}
#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08..14c040b18ed 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
unsigned long n_rp_qs_pending;
+ unsigned long n_rp_report_qs;
unsigned long n_rp_cb_ready;
unsigned long n_rp_cpu_needs_gp;
unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+ char *name; /* Name of structure. */
};
/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda894..0e4f420245d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
#include <linux/delay.h>
+/*
+ * Check the RCU kernel configuration parameters and print informative
+ * messages about anything out of the ordinary. If you like #ifdef, you
+ * will love this function.
+ */
+static void __init rcu_bootup_announce_oddness(void)
+{
+#ifdef CONFIG_RCU_TRACE
+ printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
+#endif
+#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
+ printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ CONFIG_RCU_FANOUT);
+#endif
+#ifdef CONFIG_RCU_FANOUT_EXACT
+ printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
+#endif
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ printk(KERN_INFO
+ "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+#endif
+#ifdef CONFIG_PROVE_RCU
+ printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
+#endif
+#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
+ printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
+#endif
+#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
+ printk(KERN_INFO
+ "\tRCU-based detection of stalled CPUs is disabled.\n");
+#endif
+#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+ printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
+#endif
+#if NUM_RCU_LVL_4 != 0
+ printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
+#endif
+}
+
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
*/
static void __init rcu_bootup_announce(void)
{
- printk(KERN_INFO
- "Experimental preemptable hierarchical RCU implementation.\n");
+ printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
}
/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
*/
static void rcu_preempt_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
+
rdp->passed_quiesc_completed = rdp->gpnum - 1;
barrier();
rdp->passed_quiesc = 1;
+ current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
}
/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
- rcu_preempt_qs(cpu);
local_irq_save(flags);
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ rcu_preempt_qs(cpu);
local_irq_restore(flags);
}
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
special = t->rcu_read_unlock_special;
if (special & RCU_READ_UNLOCK_NEED_QS) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
rcu_preempt_qs(smp_processor_id());
}
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
struct task_struct *t = current;
if (t->rcu_read_lock_nesting == 0) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
rcu_preempt_qs(cpu);
return;
}
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
if (!rcu_scheduler_active)
return;
+ init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@@ -754,6 +798,7 @@ void exit_rcu(void)
static void __init rcu_bootup_announce(void)
{
printk(KERN_INFO "Hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
}
/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
int rcu_needs_cpu(int cpu)
{
int c = 0;
+ int snap;
+ int snap_nmi;
int thatcpu;
/* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
return rcu_needs_cpu_quick_check(cpu);
/* Don't bother unless we are the last non-dyntick-idle CPU. */
- for_each_cpu_not(thatcpu, nohz_cpu_mask)
- if (thatcpu != cpu) {
+ for_each_online_cpu(thatcpu) {
+ if (thatcpu == cpu)
+ continue;
+ snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
+ snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+ smp_mb(); /* Order sampling of snap with end of grace period. */
+ if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
return rcu_needs_cpu_quick_check(cpu);
}
+ }
/* Check and update the rcu_dyntick_drain sequencing. */
if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d2..36c95b45738 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
{
seq_printf(m, "%3d%cnp=%ld "
- "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
+ "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->n_rcu_pending,
rdp->n_rp_qs_pending,
+ rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp,
rdp->n_rp_gp_completed,
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c2a54f70ff..1d93cd0ae4d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
-#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
+ u64 nohz_stamp;
unsigned char in_nohz_recently;
#endif
+ unsigned int skip_clock_update;
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
int post_schedule;
int active_balance;
int push_cpu;
+ struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
- struct task_struct *migration_thread;
- struct list_head migration_queue;
-
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (test_tsk_need_resched(p))
+ rq->skip_clock_update = 1;
}
static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update)
+ rq->clock = sched_clock_cpu(cpu_of(rq));
}
/*
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
*/
static inline int task_is_waking(struct task_struct *p)
{
- return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+ return unlikely(p->state == TASK_WAKING);
}
/*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
@@ -957,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
local_irq_save(*flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
@@ -1239,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
+
+int nohz_ratelimit(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 diff = rq->clock - rq->nohz_stamp;
+
+ rq->nohz_stamp = rq->clock;
+
+ return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1781,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
- update_rq_clock(rq1);
- update_rq_clock(rq2);
}
/*
@@ -1813,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@@ -1870,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}
-static void update_avg(u64 *avg, u64 sample)
-{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (wakeup)
- p->se.start_runtime = p->se.sum_exec_runtime;
-
+ update_rq_clock(rq);
sched_info_queued(p);
- p->sched_class->enqueue_task(rq, p, wakeup, head);
+ p->sched_class->enqueue_task(rq, p, flags);
p->se.on_rq = 1;
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (sleep) {
- if (p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
- } else {
- update_avg(&p->se.avg_wakeup,
- sysctl_sched_wakeup_granularity);
- }
- }
-
+ update_rq_clock(rq);
sched_info_dequeued(p);
- p->sched_class->dequeue_task(rq, p, sleep);
+ p->sched_class->dequeue_task(rq, p, flags);
p->se.on_rq = 0;
}
/*
* activate_task - move a task to the runqueue.
*/
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
- enqueue_task(rq, p, wakeup, false);
+ enqueue_task(rq, p, flags);
inc_nr_running(rq);
}
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
- dequeue_task(rq, p, sleep);
+ dequeue_task(rq, p, flags);
dec_nr_running(rq);
}
@@ -2054,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
-struct migration_req {
- struct list_head list;
-
+struct migration_arg {
struct task_struct *task;
int dest_cpu;
-
- struct completion done;
};
+static int migration_cpu_stop(void *data);
+
/*
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static int
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+static bool migrate_task(struct task_struct *p, int dest_cpu)
{
struct rq *rq = task_rq(p);
@@ -2076,58 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
*/
- if (!p->se.on_rq && !task_running(rq, p))
- return 0;
-
- init_completion(&req->done);
- req->task = p;
- req->dest_cpu = dest_cpu;
- list_add(&req->list, &rq->migration_queue);
-
- return 1;
-}
-
-/*
- * wait_task_context_switch - wait for a thread to complete at least one
- * context switch.
- *
- * @p must not be current.
- */
-void wait_task_context_switch(struct task_struct *p)
-{
- unsigned long nvcsw, nivcsw, flags;
- int running;
- struct rq *rq;
-
- nvcsw = p->nvcsw;
- nivcsw = p->nivcsw;
- for (;;) {
- /*
- * The runqueue is assigned before the actual context
- * switch. We need to take the runqueue lock.
- *
- * We could check initially without the lock but it is
- * very likely that we need to take the lock in every
- * iteration.
- */
- rq = task_rq_lock(p, &flags);
- running = task_running(rq, p);
- task_rq_unlock(rq, &flags);
-
- if (likely(!running))
- break;
- /*
- * The switch count is incremented before the actual
- * context switch. We thus wait for two switches to be
- * sure at least one completed.
- */
- if ((p->nvcsw - nvcsw) > 1)
- break;
- if ((p->nivcsw - nivcsw) > 1)
- break;
-
- cpu_relax();
- }
+ return p->se.on_rq || task_running(rq, p);
}
/*
@@ -2185,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
- trace_sched_wait_task(rq, p);
+ trace_sched_wait_task(p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
@@ -2283,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
}
#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int dest_cpu;
@@ -2299,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- rcu_read_lock();
- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- rcu_read_unlock();
- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
@@ -2321,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- * exec: is unstable, retry loop
- * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2349,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
return cpu;
}
+
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
#endif
/***
@@ -2370,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
+ unsigned long en_flags = ENQUEUE_WAKEUP;
struct rq *rq;
- if (!sched_feat(SYNC_WAKEUPS))
- wake_flags &= ~WF_SYNC;
-
this_cpu = get_cpu();
smp_wmb();
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
if (!(p->state & state))
goto out;
@@ -2399,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
*
* First fix up the nr_uninterruptible count:
*/
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ if (task_contributes_to_load(p)) {
+ if (likely(cpu_online(orig_cpu)))
+ rq->nr_uninterruptible--;
+ else
+ this_rq()->nr_uninterruptible--;
+ }
p->state = TASK_WAKING;
- if (p->sched_class->task_waking)
+ if (p->sched_class->task_waking) {
p->sched_class->task_waking(rq, p);
+ en_flags |= ENQUEUE_WAKING;
+ }
- __task_rq_unlock(rq);
-
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu) {
- /*
- * Since we migrate the task without holding any rq->lock,
- * we need to be careful with task_rq_lock(), since that
- * might end up locking an invalid rq.
- */
+ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+ if (cpu != orig_cpu)
set_task_cpu(p, cpu);
- }
+ __task_rq_unlock(rq);
rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
/*
* We migrated the task without holding either rq->lock, however
@@ -2448,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.nr_wakeups);
+ schedstat_inc(p, se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.nr_wakeups_sync);
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
if (orig_cpu != cpu)
- schedstat_inc(p, se.nr_wakeups_migrate);
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
if (cpu == this_cpu)
- schedstat_inc(p, se.nr_wakeups_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
else
- schedstat_inc(p, se.nr_wakeups_remote);
- activate_task(rq, p, 1);
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ activate_task(rq, p, en_flags);
success = 1;
- /*
- * Only attribute actual wakeups done by this task.
- */
- if (!in_interrupt()) {
- struct sched_entity *se = &current->se;
- u64 sample = se->sum_exec_runtime;
-
- if (se->last_wakeup)
- sample -= se->last_wakeup;
- else
- sample -= se->start_runtime;
- update_avg(&se->avg_wakeup, sample);
-
- se->last_wakeup = se->sum_exec_runtime;
- }
-
out_running:
- trace_sched_wakeup(rq, p, success);
+ trace_sched_wakeup(p, success);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2537,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
- p->se.last_wakeup = 0;
- p->se.avg_overlap = 0;
- p->se.start_runtime = 0;
- p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.wait_max = 0;
- p->se.wait_count = 0;
- p->se.wait_sum = 0;
-
- p->se.sleep_start = 0;
- p->se.sleep_max = 0;
- p->se.sum_sleep_runtime = 0;
-
- p->se.block_start = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
-
- p->se.nr_migrations_cold = 0;
- p->se.nr_failed_migrations_affine = 0;
- p->se.nr_failed_migrations_running = 0;
- p->se.nr_failed_migrations_hot = 0;
- p->se.nr_forced_migrations = 0;
-
- p->se.nr_wakeups = 0;
- p->se.nr_wakeups_sync = 0;
- p->se.nr_wakeups_migrate = 0;
- p->se.nr_wakeups_local = 0;
- p->se.nr_wakeups_remote = 0;
- p->se.nr_wakeups_affine = 0;
- p->se.nr_wakeups_affine_attempts = 0;
- p->se.nr_wakeups_passive = 0;
- p->se.nr_wakeups_idle = 0;
-
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2593,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
__sched_fork(p);
/*
- * We mark the process as waking here. This guarantees that
+ * We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_WAKING;
+ p->state = TASK_RUNNING;
/*
* Revert to default priority/policy on fork if requested.
@@ -2664,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
+ rq = task_rq_lock(p, &flags);
+ p->state = TASK_WAKING;
+
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*
- * We still have TASK_WAKING but PF_STARTING is gone now, meaning
- * ->cpus_allowed is stable, we have preemption disabled, meaning
- * cpu_online_mask is stable.
+ * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+ * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
set_task_cpu(p, cpu);
-#endif
-
- /*
- * Since the task is not on the rq and we still have TASK_WAKING set
- * nobody else will migrate this task.
- */
- rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
- BUG_ON(p->state != TASK_WAKING);
p->state = TASK_RUNNING;
- update_rq_clock(rq);
+ task_rq_unlock(rq, &flags);
+#endif
+
+ rq = task_rq_lock(p, &flags);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(rq, p, 1);
+ trace_sched_wakeup_new(p, 1);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
@@ -2908,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(rq, prev, next);
+ trace_sched_switch(prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3025,6 +2904,61 @@ static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+ long delta;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+ long delta = 0;
+
+ /*
+ * Its got a race, we don't care...
+ */
+ if (atomic_long_read(&calc_load_tasks_idle))
+ delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+ return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+ return 0;
+}
+#endif
+
/**
* get_avenrun - get the load average array
* @loads: pointer to dest load array
@@ -3071,20 +3005,22 @@ void calc_global_load(void)
}
/*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
*/
static void calc_load_account_active(struct rq *this_rq)
{
- long nr_active, delta;
+ long delta;
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
+ delta = calc_load_fold_active(this_rq);
+ delta += calc_load_fold_idle();
+ if (delta)
atomic_long_add(delta, &calc_load_tasks);
- }
+
+ this_rq->calc_load_update += LOAD_FREQ;
}
/*
@@ -3116,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
- if (time_after_eq(jiffies, this_rq->calc_load_update)) {
- this_rq->calc_load_update += LOAD_FREQ;
- calc_load_account_active(this_rq);
- }
+ calc_load_account_active(this_rq);
}
#ifdef CONFIG_SMP
@@ -3131,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
void sched_exec(void)
{
struct task_struct *p = current;
- struct migration_req req;
- int dest_cpu, this_cpu;
unsigned long flags;
struct rq *rq;
-
-again:
- this_cpu = get_cpu();
- dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
- if (dest_cpu == this_cpu) {
- put_cpu();
- return;
- }
+ int dest_cpu;
rq = task_rq_lock(p, &flags);
- put_cpu();
+ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ if (dest_cpu == smp_processor_id())
+ goto unlock;
/*
* select_task_rq() can race against ->cpus_allowed
*/
- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
- || unlikely(!cpu_active(dest_cpu))) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+ likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ struct migration_arg arg = { p, dest_cpu };
- /* force the process onto the specified CPU */
- if (migrate_task(p, dest_cpu, &req)) {
- /* Need to wait for migration thread (might exit: take ref). */
- struct task_struct *mt = rq->migration_thread;
-
- get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(mt);
- put_task_struct(mt);
- wait_for_completion(&req.done);
-
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
return;
}
+unlock:
task_rq_unlock(rq, &flags);
}
@@ -3640,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->state == TASK_RUNNING) {
- u64 runtime = prev->se.sum_exec_runtime;
-
- runtime -= prev->se.prev_sum_exec_runtime;
- runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
- /*
- * In order to avoid avg_overlap growing stale when we are
- * indeed overlapping and hence not getting put to sleep, grow
- * the avg_overlap on preemption.
- *
- * We use the average preemption runtime because that
- * correlates to the amount of cache footprint a task can
- * build up.
- */
- update_avg(&prev->se.avg_overlap, runtime);
- }
+ if (prev->se.on_rq)
+ update_rq_clock(rq);
+ rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3706,7 +3608,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_sched_qs(cpu);
+ rcu_note_context_switch(cpu);
prev = rq->curr;
switch_count = &prev->nivcsw;
@@ -3719,14 +3621,13 @@ need_resched_nonpreemptible:
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
else
- deactivate_task(rq, prev, 1);
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
switch_count = &prev->nvcsw;
}
@@ -4049,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(&x->wait, &wait);
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
@@ -4276,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
oldprio = p->prio;
prev_class = p->sched_class;
@@ -4297,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq) {
- enqueue_task(rq, p, 0, oldprio < prio);
+ enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
check_class_changed(rq, p, prev_class, oldprio, running);
}
@@ -4319,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4341,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (on_rq) {
- enqueue_task(rq, p, 0, false);
+ enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4602,7 +4500,6 @@ recheck:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -5339,17 +5236,15 @@ static inline void sched_init_granularity(void)
/*
* This is how migration works:
*
- * 1) we queue a struct migration_req structure in the source CPU's
- * runqueue and wake up that CPU's migration thread.
- * 2) we down() the locked semaphore => thread blocks.
- * 3) migration thread wakes up (implicitly it forces the migrated
- * thread off the CPU)
- * 4) it gets the migration request and checks whether the migrated
- * task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
* it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
- * 7) we wake up and the migration is done.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
*/
/*
@@ -5363,12 +5258,23 @@ static inline void sched_init_granularity(void)
*/
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- struct migration_req req;
unsigned long flags;
struct rq *rq;
+ unsigned int dest_cpu;
int ret = 0;
+ /*
+ * Serialize against TASK_WAKING so that ttwu() and wunt() can
+ * drop the rq->lock and still rely on ->cpus_allowed.
+ */
+again:
+ while (task_is_waking(p))
+ cpu_relax();
rq = task_rq_lock(p, &flags);
+ if (task_is_waking(p)) {
+ task_rq_unlock(rq, &flags);
+ goto again;
+ }
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
@@ -5392,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (migrate_task(p, dest_cpu)) {
+ struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- struct task_struct *mt = rq->migration_thread;
-
- get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(mt);
- put_task_struct(mt);
- wait_for_completion(&req.done);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
@@ -5458,98 +5361,49 @@ fail:
return ret;
}
-#define RCU_MIGRATION_IDLE 0
-#define RCU_MIGRATION_NEED_QS 1
-#define RCU_MIGRATION_GOT_QS 2
-#define RCU_MIGRATION_MUST_SYNC 3
-
/*
- * migration_thread - this is a highprio system thread that performs
- * thread migration by bumping thread off CPU then 'pushing' onto
- * another runqueue.
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
*/
-static int migration_thread(void *data)
-{
- int badcpu;
- int cpu = (long)data;
- struct rq *rq;
-
- rq = cpu_rq(cpu);
- BUG_ON(rq->migration_thread != current);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- struct migration_req *req;
- struct list_head *head;
-
- raw_spin_lock_irq(&rq->lock);
-
- if (cpu_is_offline(cpu)) {
- raw_spin_unlock_irq(&rq->lock);
- break;
- }
-
- if (rq->active_balance) {
- active_load_balance(rq, cpu);
- rq->active_balance = 0;
- }
-
- head = &rq->migration_queue;
-
- if (list_empty(head)) {
- raw_spin_unlock_irq(&rq->lock);
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- continue;
- }
- req = list_entry(head->next, struct migration_req, list);
- list_del_init(head->next);
-
- if (req->task != NULL) {
- raw_spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
- } else if (likely(cpu == (badcpu = smp_processor_id()))) {
- req->dest_cpu = RCU_MIGRATION_GOT_QS;
- raw_spin_unlock(&rq->lock);
- } else {
- req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
- raw_spin_unlock(&rq->lock);
- WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
- }
- local_irq_enable();
-
- complete(&req->done);
- }
- __set_current_state(TASK_RUNNING);
-
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
+static int migration_cpu_stop(void *data)
{
- int ret;
+ struct migration_arg *arg = data;
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
local_irq_disable();
- ret = __migrate_task(p, src_cpu, dest_cpu);
+ __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
local_irq_enable();
- return ret;
+ return 0;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Figure out where task on dead CPU should go, use force if necessary.
*/
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- int dest_cpu;
+ struct rq *rq = cpu_rq(dead_cpu);
+ int needs_cpu, uninitialized_var(dest_cpu);
+ unsigned long flags;
-again:
- dest_cpu = select_fallback_rq(dead_cpu, p);
+ local_irq_save(flags);
- /* It can have affinity changed while we were choosing. */
- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
- goto again;
+ raw_spin_lock(&rq->lock);
+ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+ if (needs_cpu)
+ dest_cpu = select_fallback_rq(dead_cpu, p);
+ raw_spin_unlock(&rq->lock);
+ /*
+ * It can only fail if we race with set_cpus_allowed(),
+ * in the racer should migrate the task anyway.
+ */
+ if (needs_cpu)
+ __migrate_task(p, dead_cpu, dest_cpu);
+ local_irq_restore(flags);
}
/*
@@ -5613,7 +5467,6 @@ void sched_idle_next(void)
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- update_rq_clock(rq);
activate_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5668,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
for ( ; ; ) {
if (!rq->nr_running)
break;
- update_rq_clock(rq);
next = pick_next_task(rq);
if (!next)
break;
@@ -5891,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- struct task_struct *p;
int cpu = (long)hcpu;
unsigned long flags;
- struct rq *rq;
+ struct rq *rq = cpu_rq(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
- if (IS_ERR(p))
- return NOTIFY_BAD;
- kthread_bind(p, cpu);
- /* Must be high prio: stop_machine expects to yield to it. */
- rq = task_rq_lock(p, &flags);
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- task_rq_unlock(rq, &flags);
- get_task_struct(p);
- cpu_rq(cpu)->migration_thread = p;
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- /* Strictly unnecessary, as first user will wake it. */
- wake_up_process(cpu_rq(cpu)->migration_thread);
-
/* Update our root-domain */
- rq = cpu_rq(cpu);
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5930,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!cpu_rq(cpu)->migration_thread)
- break;
- /* Unbind it from offline cpu so it can run. Fall thru. */
- kthread_bind(cpu_rq(cpu)->migration_thread,
- cpumask_any(cpu_online_mask));
- kthread_stop(cpu_rq(cpu)->migration_thread);
- put_task_struct(cpu_rq(cpu)->migration_thread);
- cpu_rq(cpu)->migration_thread = NULL;
- break;
-
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
- rq = cpu_rq(cpu);
- kthread_stop(rq->migration_thread);
- put_task_struct(rq->migration_thread);
- rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
deactivate_task(rq, rq->idle, 0);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
raw_spin_unlock_irq(&rq->lock);
- cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
- /*
- * No need to migrate the tasks: it was best-effort if
- * they didn't take sched_hotcpu_mutex. Just wake up
- * the requestors.
- */
- raw_spin_lock_irq(&rq->lock);
- while (!list_empty(&rq->migration_queue)) {
- struct migration_req *req;
-
- req = list_entry(rq->migration_queue.next,
- struct migration_req, list);
- list_del_init(&req->list);
- raw_spin_unlock_irq(&rq->lock);
- complete(&req->done);
- raw_spin_lock_irq(&rq->lock);
- }
- raw_spin_unlock_irq(&rq->lock);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
/* Update our root-domain */
- rq = cpu_rq(cpu);
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6315,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
+ for (tmp = sd; tmp; tmp = tmp->parent)
+ tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -7798,10 +7601,8 @@ void __init sched_init(void)
rq->push_cpu = 0;
rq->cpu = i;
rq->online = 0;
- rq->migration_thread = NULL;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- INIT_LIST_HEAD(&rq->migration_queue);
rq_attach_root(rq, &def_root_domain);
#endif
init_rq_hrtick(rq);
@@ -7902,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
{
int on_rq;
- update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
@@ -7929,9 +7729,9 @@ void normalize_rt_tasks(void)
p->se.exec_start = 0;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
+ p->se.statistics.wait_start = 0;
+ p->se.statistics.sleep_start = 0;
+ p->se.statistics.block_start = 0;
#endif
if (!rt_task(p)) {
@@ -8264,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags);
- update_rq_clock(rq);
-
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
@@ -8284,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (on_rq)
- enqueue_task(rq, tsk, 0, false);
+ enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, &flags);
}
@@ -9098,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
#ifndef CONFIG_SMP
-int rcu_expedited_torture_stats(char *page)
-{
- return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
void synchronize_sched_expedited(void)
{
+ barrier();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#else /* #ifndef CONFIG_SMP */
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
-
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-
-int rcu_expedited_torture_stats(char *page)
+static int synchronize_sched_expedited_cpu_stop(void *data)
{
- int cnt = 0;
- int cpu;
-
- cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
- for_each_online_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d:%d",
- cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
- }
- cnt += sprintf(&page[cnt], "\n");
- return cnt;
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
-static long synchronize_sched_expedited_count;
/*
* Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9148,18 +8935,14 @@ static long synchronize_sched_expedited_count;
*/
void synchronize_sched_expedited(void)
{
- int cpu;
- unsigned long flags;
- bool need_full_sync = 0;
- struct rq *rq;
- struct migration_req *req;
- long snap;
- int trycount = 0;
+ int snap, trycount = 0;
smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+ snap = atomic_read(&synchronize_sched_expedited_count) + 1;
get_online_cpus();
- while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
put_online_cpus();
if (trycount++ < 10)
udelay(trycount * num_online_cpus());
@@ -9167,41 +8950,15 @@ void synchronize_sched_expedited(void)
synchronize_sched();
return;
}
- if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+ if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
smp_mb(); /* ensure test happens before caller kfree */
return;
}
get_online_cpus();
}
- rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
- for_each_online_cpu(cpu) {
- rq = cpu_rq(cpu);
- req = &per_cpu(rcu_migration_req, cpu);
- init_completion(&req->done);
- req->task = NULL;
- req->dest_cpu = RCU_MIGRATION_NEED_QS;
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_add(&req->list, &rq->migration_queue);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- wake_up_process(rq->migration_thread);
- }
- for_each_online_cpu(cpu) {
- rcu_expedited_state = cpu;
- req = &per_cpu(rcu_migration_req, cpu);
- rq = cpu_rq(cpu);
- wait_for_completion(&req->done);
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
- need_full_sync = 1;
- req->dest_cpu = RCU_MIGRATION_IDLE;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
- synchronize_sched_expedited_count++;
- mutex_unlock(&rcu_sched_expedited_mutex);
+ atomic_inc(&synchronize_sched_expedited_count);
+ smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
put_online_cpus();
- if (need_full_sync)
- synchronize_sched();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 19be00ba612..87a330a7185 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->wait_start);
- PN(se->sleep_start);
- PN(se->block_start);
- PN(se->sleep_max);
- PN(se->block_max);
- PN(se->exec_max);
- PN(se->slice_max);
- PN(se->wait_max);
- PN(se->wait_sum);
- P(se->wait_count);
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
#endif
P(se->load.weight);
#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.vruntime),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.sum_sleep_runtime));
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -175,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
- {
- uid_t uid = cfs_rq->tg->uid;
- SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
- }
#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
@@ -409,40 +404,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
- PN(se.avg_overlap);
- PN(se.avg_wakeup);
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.wait_start);
- PN(se.sleep_start);
- PN(se.block_start);
- PN(se.sleep_max);
- PN(se.block_max);
- PN(se.exec_max);
- PN(se.slice_max);
- PN(se.wait_max);
- PN(se.wait_sum);
- P(se.wait_count);
- PN(se.iowait_sum);
- P(se.iowait_count);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
- P(se.nr_migrations_cold);
- P(se.nr_failed_migrations_affine);
- P(se.nr_failed_migrations_running);
- P(se.nr_failed_migrations_hot);
- P(se.nr_forced_migrations);
- P(se.nr_wakeups);
- P(se.nr_wakeups_sync);
- P(se.nr_wakeups_migrate);
- P(se.nr_wakeups_local);
- P(se.nr_wakeups_remote);
- P(se.nr_wakeups_affine);
- P(se.nr_wakeups_affine_attempts);
- P(se.nr_wakeups_passive);
- P(se.nr_wakeups_idle);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
{
u64 avg_atom, avg_per_cpu;
@@ -493,31 +486,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_max = 0;
- p->se.wait_sum = 0;
- p->se.wait_count = 0;
- p->se.iowait_sum = 0;
- p->se.iowait_count = 0;
- p->se.sleep_max = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.nr_migrations = 0;
- p->se.nr_migrations_cold = 0;
- p->se.nr_failed_migrations_affine = 0;
- p->se.nr_failed_migrations_running = 0;
- p->se.nr_failed_migrations_hot = 0;
- p->se.nr_forced_migrations = 0;
- p->se.nr_wakeups = 0;
- p->se.nr_wakeups_sync = 0;
- p->se.nr_wakeups_migrate = 0;
- p->se.nr_wakeups_local = 0;
- p->se.nr_wakeups_remote = 0;
- p->se.nr_wakeups_affine = 0;
- p->se.nr_wakeups_affine_attempts = 0;
- p->se.nr_wakeups_passive = 0;
- p->se.nr_wakeups_idle = 0;
- p->sched_info.bkl_count = 0;
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924..217e4a9393e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
{
unsigned long delta_exec_weighted;
- schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ schedstat_set(curr->statistics.exec_max,
+ max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_max, max(se->wait_max,
- rq_of(cfs_rq)->clock - se->wait_start));
- schedstat_set(se->wait_count, se->wait_count + 1);
- schedstat_set(se->wait_sum, se->wait_sum +
- rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->wait_start);
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
- schedstat_set(se->wait_start, 0);
+ schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (entity_is_task(se))
tsk = task_of(se);
- if (se->sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->sleep_max))
- se->sleep_max = delta;
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
- se->sleep_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
- if (se->block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ if (se->statistics.block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->block_max))
- se->block_max = delta;
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
- se->block_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
- se->iowait_sum += delta;
- se->iowait_count++;
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
- * Convert the sleeper threshold into virtual time.
- * SCHED_IDLE is a special sub-class. We care about
- * fairness only relative to other SCHED_IDLE tasks,
- * all of which have the same weight.
- */
- if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- task_of(se)->policy != SCHED_IDLE))
- thresh = calc_delta_fair(thresh, se);
-
- /*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
se->vruntime = vruntime;
}
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
- if (sleep) {
+ if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
- if (!sleep)
+ if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->slice_max = max(se->slice_max,
+ se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
* then put the task into the rbtree:
*/
static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int flags = 0;
-
- if (wakeup)
- flags |= ENQUEUE_WAKEUP;
- if (p->state == TASK_WAKING)
- flags |= ENQUEUE_MIGRATE;
for_each_sched_entity(se) {
if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, sleep);
+ dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
- sleep = 1;
+ flags |= DEQUEUE_SLEEP;
}
hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (sync) {
- if (sched_feat(SYNC_LESS) &&
- (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
- } else {
- if (sched_feat(SYNC_MORE) &&
- (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost))
- sync = 1;
- }
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (sync && balanced)
return 1;
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
+ struct sched_domain *sd;
int i;
/*
- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
- * test in select_task_rq_fair) and the prev_cpu is idle then that's
- * always a better target than the current cpu.
+ * If the task is going to be woken-up on this cpu and if it is
+ * already idle, then it is the right target.
*/
- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+ if (target == cpu && idle_cpu(cpu))
+ return cpu;
+
+ /*
+ * If the task is going to be woken-up on the cpu where it previously
+ * ran and if it is currently idle, then it the right target.
+ */
+ if (target == prev_cpu && idle_cpu(prev_cpu))
return prev_cpu;
/*
- * Otherwise, iterate the domain and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
- if (!cpu_rq(i)->cfs.nr_running) {
- target = i;
+ for_each_domain(target, sd) {
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
+
+ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
}
+
+ /*
+ * Lets stop looking for an idle sibling when we reached
+ * the domain that spans the current cpu and prev_cpu.
+ */
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ break;
}
return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
*
* preempt must be disabled.
*/
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}
/*
- * While iterating the domains looking for a spanning
- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
- * in cache sharing domains along the way.
+ * If both cpu and prev_cpu are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
*/
- if (want_affine) {
- int target = -1;
-
- /*
- * If both cpu and prev_cpu are part of this domain,
- * cpu is a valid SD_WAKE_AFFINE target.
- */
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
- target = cpu;
-
- /*
- * If there's an idle sibling in this domain, make that
- * the wake_affine target instead of the current cpu.
- */
- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
- target = select_idle_sibling(p, tmp, target);
-
- if (target >= 0) {
- if (tmp->flags & SD_WAKE_AFFINE) {
- affine_sd = tmp;
- want_affine = 0;
- }
- cpu = target;
- }
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ affine_sd = tmp;
+ want_affine = 0;
}
if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
sd = tmp;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
- if (affine_sd && (!tmp ||
- cpumask_weight(sched_domain_span(affine_sd)) >
- cpumask_weight(sched_domain_span(sd))))
+ if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
tmp = affine_sd;
- if (tmp)
+ if (tmp) {
+ raw_spin_unlock(&rq->lock);
update_shares(tmp);
+ raw_spin_lock(&rq->lock);
+ }
}
+#endif
- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ return select_idle_sibling(p, cpu);
+ else
+ return select_idle_sibling(p, prev_cpu);
+ }
while (sd) {
int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
- weight = cpumask_weight(sched_domain_span(sd));
+ weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
- if (weight <= cpumask_weight(sched_domain_span(tmp)))
+ if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}
#endif /* CONFIG_SMP */
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- * degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
- u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- u64 gran = 0;
-
- if (this_run < expected_wakeup)
- gran = expected_wakeup - this_run;
-
- return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
- if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- gran = adaptive_gran(curr, se);
-
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
return gran;
}
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int sync = wake_flags & WF_SYNC;
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
- if (sched_feat(WAKEUP_SYNC) && sync)
- goto preempt;
-
- if (sched_feat(WAKEUP_OVERLAP) &&
- se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost)
- goto preempt;
-
if (!sched_feat(WAKEUP_PREEMPT))
return;
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
+static int active_load_balance_cpu_stop(void *data);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
if (need_active_balance(sd, sd_idle, idle)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
- /* don't kick the migration_thread, if the curr
- * task on busiest cpu can't be moved to this_cpu
+ /* don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest cpu can't be
+ * moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
&busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
goto out_one_pinned;
}
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
/*
* We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
}
/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
*/
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
- struct rq *target_rq;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- return;
-
- target_rq = cpu_rq(target_cpu);
+ goto out_unlock;
/*
* This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
- update_rq_clock(busiest_rq);
- update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
schedstat_inc(sd, alb_failed);
}
double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
}
#ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d..83c66e8ad3e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
/*
- * Disregards a certain amount of sleep time (sched_latency_ns) and
- * considers the task to be running during that period. This gives it
- * a service deficit on wakeup, allowing it to run sooner.
- */
-SCHED_FEAT(FAIR_SLEEPERS, 1)
-
-/*
* Only give sleepers 50% of their service deficit. This allows
* them to run sooner, but does not allow tons of sleepers to
* rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
/*
- * By not normalizing the sleep time, heavy tasks get an effective
- * longer period, and lighter task an effective shorter period they
- * are considered running.
- */
-SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-
-/*
* Place new tasks ahead so that they do not starve already running
* tasks
*/
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
/*
- * Compute wakeup_gran based on task behaviour, clipped to
- * [0, sched_wakeup_gran_ns]
- */
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-
-/*
- * When converting the wakeup granularity to virtual time, do it such
- * that heavier tasks preempting a lighter task have an edge.
- */
-SCHED_FEAT(ASYM_GRAN, 1)
-
-/*
- * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
- */
-SCHED_FEAT(WAKEUP_SYNC, 0)
-
-/*
- * Wakeup preempt based on task behaviour. Tasks that do not overlap
- * don't get preempted.
- */
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-
-/*
- * Use the SYNC wakeup hint, pipes and the likes use this to indicate
- * the remote end is likely to consume the data we just wrote, and
- * therefore has cache benefit from being placed on the same cpu, see
- * also AFFINE_WAKEUPS.
- */
-SCHED_FEAT(SYNC_WAKEUPS, 1)
-
-/*
* Based on load and program behaviour, see if it makes sense to place
* a newly woken task on the same cpu as the task that woke it --
* improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
/*
- * Weaken SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_LESS, 1)
-
-/*
- * Add SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_MORE, 0)
-
-/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a5094..9fa0f402c87 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
*/
#ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- /* adjust the active tasks as we might go into a long sleep */
- calc_load_account_active(rq);
+ calc_load_account_idle(rq);
return rq->idle;
}
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
* message if some code attempts to do it:
*/
static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea..8afb953e31c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+ schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Adding/removing a task to/from a priority array:
*/
static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
- if (wakeup)
+ if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, head);
+ enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
{
- struct rq *rq = task_rq(p);
-
if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef027..0db913a5c60 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
preempt_enable_no_resched();
cond_resched();
preempt_disable();
- rcu_sched_qs((long)__bind_cpu);
+ rcu_note_context_switch((long)__bind_cpu);
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79..b4e7431e7c7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
-/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
- * GPL v2 and any later version.
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005 IBM Corporation.
+ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
*/
+#include <linux/completion.h>
#include <linux/cpu.h>
-#include <linux/err.h>
+#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/stop_machine.h>
-#include <linux/syscalls.h>
#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
#include <asm/atomic.h>
-#include <asm/uaccess.h>
+
+/*
+ * Structure to determine completion condition and record errors. May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+ atomic_t nr_todo; /* nr left to execute */
+ bool executed; /* actually executed? */
+ int ret; /* collected return value */
+ struct completion completion; /* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+ spinlock_t lock;
+ struct list_head works; /* list of pending works */
+ struct task_struct *thread; /* stopper thread */
+ bool enabled; /* is this stopper enabled? */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+ memset(done, 0, sizeof(*done));
+ atomic_set(&done->nr_todo, nr_todo);
+ init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+ if (done) {
+ if (executed)
+ done->executed = true;
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
+ }
+}
+
+/* queue @work to @stopper. if offline, @work is completed immediately */
+static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+ struct cpu_stop_work *work)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+
+ if (stopper->enabled) {
+ list_add_tail(&work->list, &stopper->works);
+ wake_up_process(stopper->thread);
+ } else
+ cpu_stop_signal_done(work->done, false);
+
+ spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu. @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it. This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes. If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus. @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+ cpu_stop_init_done(&done, 1);
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion. The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_work *work_buf)
+{
+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_work *work;
+ struct cpu_stop_done done;
+ unsigned int cpu;
+
+ /* initialize works and done */
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(stop_cpus_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = &done;
+ }
+ cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+ /*
+ * Disable preemption while queueing to avoid getting
+ * preempted by a stopper which might wait for other stoppers
+ * to enter @fn which can lead to deadlock.
+ */
+ preempt_disable();
+ for_each_cpu(cpu, cpumask)
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+ &per_cpu(stop_cpus_work, cpu));
+ preempt_enable();
+
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it. This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes. If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus. @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ mutex_lock(&stop_cpus_mutex);
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ if (!mutex_trylock(&stop_cpus_mutex))
+ return -EAGAIN;
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+static int cpu_stopper_thread(void *data)
+{
+ struct cpu_stopper *stopper = data;
+ struct cpu_stop_work *work;
+ int ret;
+
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&stopper->lock);
+ if (!list_empty(&stopper->works)) {
+ work = list_first_entry(&stopper->works,
+ struct cpu_stop_work, list);
+ list_del_init(&work->list);
+ }
+ spin_unlock_irq(&stopper->lock);
+
+ if (work) {
+ cpu_stop_fn_t fn = work->fn;
+ void *arg = work->arg;
+ struct cpu_stop_done *done = work->done;
+ char ksym_buf[KSYM_NAME_LEN];
+
+ __set_current_state(TASK_RUNNING);
+
+ /* cpu stop callbacks are not allowed to sleep */
+ preempt_disable();
+
+ ret = fn(arg);
+ if (ret)
+ done->ret = ret;
+
+ /* restore preemption and check it's still balanced */
+ preempt_enable();
+ WARN_ONCE(preempt_count(),
+ "cpu_stop: %s(%p) leaked preempt count\n",
+ kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+ ksym_buf), arg);
+
+ cpu_stop_signal_done(done, true);
+ } else
+ schedule();
+
+ goto repeat;
+}
+
+/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct task_struct *p;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ BUG_ON(stopper->thread || stopper->enabled ||
+ !list_empty(&stopper->works));
+ p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
+ cpu);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+ get_task_struct(p);
+ stopper->thread = p;
+ break;
+
+ case CPU_ONLINE:
+ kthread_bind(stopper->thread, cpu);
+ /* strictly unnecessary, as first user will wake it */
+ wake_up_process(stopper->thread);
+ /* mark enabled */
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ {
+ struct cpu_stop_work *work;
+
+ /* kill the stopper */
+ kthread_stop(stopper->thread);
+ /* drain remaining works */
+ spin_lock_irq(&stopper->lock);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irq(&stopper->lock);
+ /* release the stopper */
+ put_task_struct(stopper->thread);
+ stopper->thread = NULL;
+ break;
+ }
+#endif
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Give it a higher priority so that cpu stopper is available to other
+ * cpu notifiers. It currently shares the same priority as sched
+ * migration_notifier.
+ */
+static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+ .notifier_call = cpu_stop_cpu_callback,
+ .priority = 10,
+};
+
+static int __init cpu_stop_init(void)
+{
+ void *bcpu = (void *)(long)smp_processor_id();
+ unsigned int cpu;
+ int err;
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_init(&stopper->lock);
+ INIT_LIST_HEAD(&stopper->works);
+ }
+
+ /* start one for the boot cpu */
+ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
+ bcpu);
+ BUG_ON(err == NOTIFY_BAD);
+ cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
+ register_cpu_notifier(&cpu_stop_cpu_notifier);
+
+ return 0;
+}
+early_initcall(cpu_stop_init);
+
+#ifdef CONFIG_STOP_MACHINE
/* This controls the threads on each CPU. */
enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
/* Exit */
STOPMACHINE_EXIT,
};
-static enum stopmachine_state state;
struct stop_machine_data {
- int (*fn)(void *);
- void *data;
- int fnret;
+ int (*fn)(void *);
+ void *data;
+ /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+ unsigned int num_threads;
+ const struct cpumask *active_cpus;
+
+ enum stopmachine_state state;
+ atomic_t thread_ack;
};
-/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
-static atomic_t thread_ack;
-static DEFINE_MUTEX(lock);
-/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
-static DEFINE_MUTEX(setup_lock);
-/* Users of stop_machine. */
-static int refcount;
-static struct workqueue_struct *stop_machine_wq;
-static struct stop_machine_data active, idle;
-static const struct cpumask *active_cpus;
-static void __percpu *stop_machine_work;
-
-static void set_state(enum stopmachine_state newstate)
+static void set_state(struct stop_machine_data *smdata,
+ enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, num_threads);
+ atomic_set(&smdata->thread_ack, smdata->num_threads);
smp_wmb();
- state = newstate;
+ smdata->state = newstate;
}
/* Last one to ack a state moves to the next state. */
-static void ack_state(void)
+static void ack_state(struct stop_machine_data *smdata)
{
- if (atomic_dec_and_test(&thread_ack))
- set_state(state + 1);
+ if (atomic_dec_and_test(&smdata->thread_ack))
+ set_state(smdata, smdata->state + 1);
}
-/* This is the actual function which stops the CPU. It runs
- * in the context of a dedicated stopmachine workqueue. */
-static void stop_cpu(struct work_struct *unused)
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
{
+ struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
- struct stop_machine_data *smdata = &idle;
- int cpu = smp_processor_id();
- int err;
+ int cpu = smp_processor_id(), err = 0;
+ bool is_active;
+
+ if (!smdata->active_cpus)
+ is_active = cpu == cpumask_first(cpu_online_mask);
+ else
+ is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
- if (!active_cpus) {
- if (cpu == cpumask_first(cpu_online_mask))
- smdata = &active;
- } else {
- if (cpumask_test_cpu(cpu, active_cpus))
- smdata = &active;
- }
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
cpu_relax();
- if (state != curstate) {
- curstate = state;
+ if (smdata->state != curstate) {
+ curstate = smdata->state;
switch (curstate) {
case STOPMACHINE_DISABLE_IRQ:
local_irq_disable();
hard_irq_disable();
break;
case STOPMACHINE_RUN:
- /* On multiple CPUs only a single error code
- * is needed to tell that something failed. */
- err = smdata->fn(smdata->data);
- if (err)
- smdata->fnret = err;
+ if (is_active)
+ err = smdata->fn(smdata->data);
break;
default:
break;
}
- ack_state();
+ ack_state(smdata);
}
} while (curstate != STOPMACHINE_EXIT);
local_irq_enable();
+ return err;
}
-/* Callback for CPUs which aren't supposed to do anything. */
-static int chill(void *unused)
-{
- return 0;
-}
-
-int stop_machine_create(void)
-{
- mutex_lock(&setup_lock);
- if (refcount)
- goto done;
- stop_machine_wq = create_rt_workqueue("kstop");
- if (!stop_machine_wq)
- goto err_out;
- stop_machine_work = alloc_percpu(struct work_struct);
- if (!stop_machine_work)
- goto err_out;
-done:
- refcount++;
- mutex_unlock(&setup_lock);
- return 0;
-
-err_out:
- if (stop_machine_wq)
- destroy_workqueue(stop_machine_wq);
- mutex_unlock(&setup_lock);
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(stop_machine_create);
-
-void stop_machine_destroy(void)
-{
- mutex_lock(&setup_lock);
- refcount--;
- if (refcount)
- goto done;
- destroy_workqueue(stop_machine_wq);
- free_percpu(stop_machine_work);
-done:
- mutex_unlock(&setup_lock);
-}
-EXPORT_SYMBOL_GPL(stop_machine_destroy);
-
int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
- struct work_struct *sm_work;
- int i, ret;
-
- /* Set up initial state. */
- mutex_lock(&lock);
- num_threads = num_online_cpus();
- active_cpus = cpus;
- active.fn = fn;
- active.data = data;
- active.fnret = 0;
- idle.fn = chill;
- idle.data = NULL;
-
- set_state(STOPMACHINE_PREPARE);
-
- /* Schedule the stop_cpu work on all cpus: hold this CPU so one
- * doesn't hit this CPU until we're ready. */
- get_cpu();
- for_each_online_cpu(i) {
- sm_work = per_cpu_ptr(stop_machine_work, i);
- INIT_WORK(sm_work, stop_cpu);
- queue_work_on(i, stop_machine_wq, sm_work);
- }
- /* This will release the thread on our CPU. */
- put_cpu();
- flush_workqueue(stop_machine_wq);
- ret = active.fnret;
- mutex_unlock(&lock);
- return ret;
+ struct stop_machine_data smdata = { .fn = fn, .data = data,
+ .num_threads = num_online_cpus(),
+ .active_cpus = cpus };
+
+ /* Set the initial state and stop all online cpus. */
+ set_state(&smdata, STOPMACHINE_PREPARE);
+ return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
}
int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
int ret;
- ret = stop_machine_create();
- if (ret)
- return ret;
/* No CPUs can come up or down during this. */
get_online_cpus();
ret = __stop_machine(fn, data, cpus);
put_online_cpus();
- stop_machine_destroy();
return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
+
+#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f5..1d7b9bc1c03 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog();
}
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+{
+ ktime_t delta;
+
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ if (nr_iowait_cpu() > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ ts->idle_entrytime = now;
+ }
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+}
+
static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t delta;
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now, delta;
+ ktime_t now;
now = ktime_get();
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- }
+
+ update_ts_time_stats(ts, now, NULL);
+
ts->idle_entrytime = now;
ts->idle_active = 1;
sched_clock_idle_sleep_event();
return now;
}
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- if (ts->idle_active)
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
- else
- *last_update_time = ktime_to_us(ktime_get());
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
return ktime_to_us(ts->idle_sleeptime);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+/*
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (!tick_nohz_enabled)
+ return -1;
+
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
+
+ return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
goto end;
}
+ if (nohz_ratelimit(cpu))
+ goto end;
+
ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd7877..ab8f5e33fa9 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_waketime);
P_ns(idle_exittime);
P_ns(idle_sleeptime);
+ P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd..8b1797c4545 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
help
See Documentation/trace/ftrace-design.txt
-config HAVE_HW_BRANCH_TRACER
- bool
-
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -374,14 +371,6 @@ config STACK_TRACER
Say N if unsure.
-config HW_BRANCH_TRACER
- depends on HAVE_HW_BRANCH_TRACER
- bool "Trace hw branches"
- select GENERIC_TRACER
- help
- This tracer records all branches on the system in a circular
- buffer, giving access to the last N branches for each cpu.
-
config KMEMTRACE
bool "Trace SLAB allocations"
select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc649003..ffb1a5b0550 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c9c2934c45..32837e19e3b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3234,8 +3234,7 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 911e9864e94..d1ce0bec1b3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
- TRACE_HW_BRANCHES,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
long ret;
};
-struct kprobe_trace_entry {
+struct kprobe_trace_entry_head {
struct trace_entry ent;
unsigned long ip;
- int nargs;
- unsigned long args[];
};
-#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
- (offsetof(struct kprobe_trace_entry, args) + \
- (sizeof(unsigned long) * (n)))
-
-struct kretprobe_trace_entry {
+struct kretprobe_trace_entry_head {
struct trace_entry ent;
unsigned long func;
unsigned long ret_ip;
- int nargs;
- unsigned long args[];
};
-#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
- (offsetof(struct kretprobe_trace_entry, args) + \
- (sizeof(unsigned long) * (n)))
-
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
- IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -470,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_hw_branches(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_ksym(struct tracer *trace,
struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399d..dc008c1240d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
__entry->func, __entry->file, __entry->correct)
);
-FTRACE_ENTRY(hw_branch, hw_branch_entry,
-
- TRACE_HW_BRANCHES,
-
- F_STRUCT(
- __field( u64, from )
- __field( u64, to )
- ),
-
- F_printk("from: %llx to: %llx", __entry->from, __entry->to)
-);
-
FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7f..58092d844a1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
}
err = -EINVAL;
- if (!call)
+ if (&call->list == &ftrace_events)
goto out_unlock;
err = -EEXIST;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f..00000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * h/w branch tracer for x86 based on BTS
- *
- * Copyright (C) 2008-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-
-#include <asm/ds.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-
-#define BTS_BUFFER_SIZE (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-
-#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
-
-static int trace_hw_branches_enabled __read_mostly;
-static int trace_hw_branches_suspended __read_mostly;
-static struct trace_array *hw_branch_trace __read_mostly;
-
-
-static void bts_trace_init_cpu(int cpu)
-{
- per_cpu(hwb_tracer, cpu) =
- ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
- BTS_BUFFER_SIZE, NULL, (size_t)-1,
- BTS_KERNEL);
-
- if (IS_ERR(per_cpu(hwb_tracer, cpu)))
- per_cpu(hwb_tracer, cpu) = NULL;
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
- int cpu;
-
- hw_branch_trace = tr;
- trace_hw_branches_enabled = 0;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- bts_trace_init_cpu(cpu);
-
- if (likely(per_cpu(hwb_tracer, cpu)))
- trace_hw_branches_enabled = 1;
- }
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-
- /* If we could not enable tracing on a single cpu, we fail. */
- return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- if (likely(per_cpu(hwb_tracer, cpu))) {
- ds_release_bts(per_cpu(hwb_tracer, cpu));
- per_cpu(hwb_tracer, cpu) = NULL;
- }
- }
- trace_hw_branches_enabled = 0;
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(hwb_tracer, cpu)))
- ds_resume_bts(per_cpu(hwb_tracer, cpu));
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(hwb_tracer, cpu)))
- ds_suspend_bts(per_cpu(hwb_tracer, cpu));
- trace_hw_branches_suspended = 1;
- put_online_cpus();
-}
-
-static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- /* The notification is sent with interrupts enabled. */
- if (trace_hw_branches_enabled) {
- bts_trace_init_cpu(cpu);
-
- if (trace_hw_branches_suspended &&
- likely(per_cpu(hwb_tracer, cpu)))
- ds_suspend_bts(per_cpu(hwb_tracer, cpu));
- }
- break;
-
- case CPU_DOWN_PREPARE:
- /* The notification is sent with interrupts enabled. */
- if (likely(per_cpu(hwb_tracer, cpu))) {
- ds_release_bts(per_cpu(hwb_tracer, cpu));
- per_cpu(hwb_tracer, cpu) = NULL;
- }
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
- .notifier_call = bts_hotcpu_handler
-};
-
-static void bts_trace_print_header(struct seq_file *m)
-{
- seq_puts(m, "# CPU# TO <- FROM\n");
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
- unsigned long symflags = TRACE_ITER_SYM_OFFSET;
- struct trace_entry *entry = iter->ent;
- struct trace_seq *seq = &iter->seq;
- struct hw_branch_entry *it;
-
- trace_assign_type(it, entry);
-
- if (entry->type == TRACE_HW_BRANCHES) {
- if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
- seq_print_ip_sym(seq, it->to, symflags) &&
- trace_seq_printf(seq, "\t <- ") &&
- seq_print_ip_sym(seq, it->from, symflags) &&
- trace_seq_printf(seq, "\n"))
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_hw_branch(u64 from, u64 to)
-{
- struct ftrace_event_call *call = &event_hw_branch;
- struct trace_array *tr = hw_branch_trace;
- struct ring_buffer_event *event;
- struct ring_buffer *buf;
- struct hw_branch_entry *entry;
- unsigned long irq1;
- int cpu;
-
- if (unlikely(!tr))
- return;
-
- if (unlikely(!trace_hw_branches_enabled))
- return;
-
- local_irq_save(irq1);
- cpu = raw_smp_processor_id();
- if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
- goto out;
-
- buf = tr->buffer;
- event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, from);
- entry->ent.type = TRACE_HW_BRANCHES;
- entry->from = from;
- entry->to = to;
- if (!filter_check_discard(call, entry, buf, event))
- trace_buffer_unlock_commit(buf, event, 0, 0);
-
- out:
- atomic_dec(&tr->data[cpu]->disabled);
- local_irq_restore(irq1);
-}
-
-static void trace_bts_at(const struct bts_trace *trace, void *at)
-{
- struct bts_struct bts;
- int err = 0;
-
- WARN_ON_ONCE(!trace->read);
- if (!trace->read)
- return;
-
- err = trace->read(this_tracer, at, &bts);
- if (err < 0)
- return;
-
- switch (bts.qualifier) {
- case BTS_BRANCH:
- trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
- break;
- }
-}
-
-/*
- * Collect the trace on the current cpu and write it into the ftrace buffer.
- *
- * pre: tracing must be suspended on the current cpu
- */
-static void trace_bts_cpu(void *arg)
-{
- struct trace_array *tr = (struct trace_array *)arg;
- const struct bts_trace *trace;
- unsigned char *at;
-
- if (unlikely(!tr))
- return;
-
- if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
- return;
-
- if (unlikely(!this_tracer))
- return;
-
- trace = ds_read_bts(this_tracer);
- if (!trace)
- return;
-
- for (at = trace->ds.top; (void *)at < trace->ds.end;
- at += trace->ds.size)
- trace_bts_at(trace, at);
-
- for (at = trace->ds.begin; (void *)at < trace->ds.top;
- at += trace->ds.size)
- trace_bts_at(trace, at);
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(hwb_tracer, cpu)))
- ds_suspend_bts(per_cpu(hwb_tracer, cpu));
- /*
- * We need to collect the trace on the respective cpu since ftrace
- * implicitly adds the record for the current cpu.
- * Once that is more flexible, we could collect the data from any cpu.
- */
- on_each_cpu(trace_bts_cpu, iter->tr, 1);
-
- for_each_online_cpu(cpu)
- if (likely(per_cpu(hwb_tracer, cpu)))
- ds_resume_bts(per_cpu(hwb_tracer, cpu));
- put_online_cpus();
-}
-
-static void trace_bts_close(struct trace_iterator *iter)
-{
- tracing_reset_online_cpus(iter->tr);
-}
-
-void trace_hw_branch_oops(void)
-{
- if (this_tracer) {
- ds_suspend_bts_noirq(this_tracer);
- trace_bts_cpu(hw_branch_trace);
- ds_resume_bts_noirq(this_tracer);
- }
-}
-
-struct tracer bts_tracer __read_mostly =
-{
- .name = "hw-branch-tracer",
- .init = bts_trace_init,
- .reset = bts_trace_reset,
- .print_header = bts_trace_print_header,
- .print_line = bts_trace_print_line,
- .start = bts_trace_start,
- .stop = bts_trace_stop,
- .open = trace_bts_prepare,
- .close = trace_bts_close,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_hw_branches,
-#endif /* CONFIG_FTRACE_SELFTEST */
-};
-
-__init static int init_bts_trace(void)
-{
- register_hotcpu_notifier(&bts_hotcpu_notifier);
- return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae..a7514326052 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
#include <linux/ctype.h>
#include <linux/ptrace.h>
#include <linux/perf_event.h>
+#include <linux/stringify.h>
+#include <asm/bitsperlong.h>
#include "trace.h"
#include "trace_output.h"
@@ -40,7 +42,6 @@
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
-#define FIELD_STRING_NARGS "__probe_nargs"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
"common_tgid",
"common_lock_depth",
FIELD_STRING_IP,
- FIELD_STRING_NARGS,
FIELD_STRING_RETIP,
FIELD_STRING_FUNC,
};
-struct fetch_func {
- unsigned long (*func)(struct pt_regs *, void *);
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
+#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
+#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
+static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
+ const char *name, void *data)\
+{ \
+ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
+} \
+static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+
+struct fetch_param {
+ fetch_func_t fn;
void *data;
};
-static __kprobes unsigned long call_fetch(struct fetch_func *f,
- struct pt_regs *regs)
+static __kprobes void call_fetch(struct fetch_param *fprm,
+ struct pt_regs *regs, void *dest)
{
- return f->func(regs, f->data);
+ return fprm->fn(regs, fprm->data, dest);
}
-/* fetch handlers */
-static __kprobes unsigned long fetch_register(struct pt_regs *regs,
- void *offset)
-{
- return regs_get_register(regs, (unsigned int)((unsigned long)offset));
+#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(kind) \
+DEFINE_FETCH_##kind(u8) \
+DEFINE_FETCH_##kind(u16) \
+DEFINE_FETCH_##kind(u32) \
+DEFINE_FETCH_##kind(u64)
+
+#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
+ ((FETCH_FUNC_NAME(kind, u8) == fn) || \
+ (FETCH_FUNC_NAME(kind, u16) == fn) || \
+ (FETCH_FUNC_NAME(kind, u32) == fn) || \
+ (FETCH_FUNC_NAME(kind, u64) == fn))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type) \
+static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_register(regs, \
+ (unsigned int)((unsigned long)offset)); \
}
-
-static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
- void *num)
-{
- return regs_get_kernel_stack_nth(regs,
- (unsigned int)((unsigned long)num));
+DEFINE_BASIC_FETCH_FUNCS(reg)
+
+#define DEFINE_FETCH_stack(type) \
+static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
+ (unsigned int)((unsigned long)offset)); \
}
+DEFINE_BASIC_FETCH_FUNCS(stack)
-static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
-{
- unsigned long retval;
-
- if (probe_kernel_address(addr, retval))
- return 0;
- return retval;
+#define DEFINE_FETCH_retval(type) \
+static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
+ void *dummy, void *dest) \
+{ \
+ *(type *)dest = (type)regs_return_value(regs); \
}
-
-static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
- void *dummy)
-{
- return regs_return_value(regs);
-}
-
-static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
- void *dummy)
-{
- return kernel_stack_pointer(regs);
+DEFINE_BASIC_FETCH_FUNCS(retval)
+
+#define DEFINE_FETCH_memory(type) \
+static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ if (probe_kernel_address(addr, retval)) \
+ *(type *)dest = 0; \
+ else \
+ *(type *)dest = retval; \
}
+DEFINE_BASIC_FETCH_FUNCS(memory)
/* Memory fetching by symbol */
struct symbol_cache {
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
return sc;
}
-static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
-{
- struct symbol_cache *sc = data;
-
- if (sc->addr)
- return fetch_memory(regs, (void *)sc->addr);
- else
- return 0;
+#define DEFINE_FETCH_symbol(type) \
+static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct symbol_cache *sc = data; \
+ if (sc->addr) \
+ fetch_memory_##type(regs, (void *)sc->addr, dest); \
+ else \
+ *(type *)dest = 0; \
}
+DEFINE_BASIC_FETCH_FUNCS(symbol)
-/* Special indirect memory access interface */
-struct indirect_fetch_data {
- struct fetch_func orig;
+/* Dereference memory access function */
+struct deref_fetch_param {
+ struct fetch_param orig;
long offset;
};
-static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
-{
- struct indirect_fetch_data *ind = data;
- unsigned long addr;
-
- addr = call_fetch(&ind->orig, regs);
- if (addr) {
- addr += ind->offset;
- return fetch_memory(regs, (void *)addr);
- } else
- return 0;
+#define DEFINE_FETCH_deref(type) \
+static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct deref_fetch_param *dprm = data; \
+ unsigned long addr; \
+ call_fetch(&dprm->orig, regs, &addr); \
+ if (addr) { \
+ addr += dprm->offset; \
+ fetch_memory_##type(regs, (void *)addr, dest); \
+ } else \
+ *(type *)dest = 0; \
}
+DEFINE_BASIC_FETCH_FUNCS(deref)
-static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
{
- if (data->orig.func == fetch_indirect)
- free_indirect_fetch_data(data->orig.data);
- else if (data->orig.func == fetch_symbol)
+ if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
free_symbol_cache(data->orig.data);
kfree(data);
}
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+#define ASSIGN_FETCH_FUNC(kind, type) \
+ .kind = FETCH_FUNC_NAME(kind, type)
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ {.name = #ptype, \
+ .size = sizeof(ftype), \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ }
+
+/* Fetch type information table */
+static const struct fetch_type {
+ const char *name; /* Name of type */
+ size_t size; /* Byte size of type */
+ int is_signed; /* Signed flag */
+ print_type_func_t print; /* Print functions */
+ const char *fmt; /* Fromat string */
+ /* Fetch functions */
+ fetch_func_t reg;
+ fetch_func_t stack;
+ fetch_func_t retval;
+ fetch_func_t memory;
+ fetch_func_t symbol;
+ fetch_func_t deref;
+} fetch_type_table[] = {
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+};
+
+static const struct fetch_type *find_fetch_type(const char *type)
+{
+ int i;
+
+ if (!type)
+ type = DEFAULT_FETCH_TYPE_STR;
+
+ for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
+ if (strcmp(type, fetch_type_table[i].name) == 0)
+ return &fetch_type_table[i];
+ return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static __kprobes void fetch_stack_address(struct pt_regs *regs,
+ void *dummy, void *dest)
+{
+ *(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+
/**
* Kprobe event core functions
*/
struct probe_arg {
- struct fetch_func fetch;
- const char *name;
+ struct fetch_param fetch;
+ unsigned int offset; /* Offset from argument entry */
+ const char *name; /* Name of this argument */
+ const char *comm; /* Command of this argument */
+ const struct fetch_type *type; /* Type of this argument */
};
/* Flags for trace_probe */
@@ -204,6 +326,7 @@ struct trace_probe {
const char *symbol; /* symbol name */
struct ftrace_event_call call;
struct trace_event event;
+ ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
};
@@ -212,6 +335,7 @@ struct trace_probe {
(offsetof(struct trace_probe, args) + \
(sizeof(struct probe_arg) * (n)))
+
static __kprobes int probe_is_return(struct trace_probe *tp)
{
return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
return tp->symbol ? tp->symbol : "unknown";
}
-static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
-{
- int ret = -EINVAL;
-
- if (ff->func == fetch_register) {
- const char *name;
- name = regs_query_register_name((unsigned int)((long)ff->data));
- ret = snprintf(buf, n, "%%%s", name);
- } else if (ff->func == fetch_stack)
- ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
- else if (ff->func == fetch_memory)
- ret = snprintf(buf, n, "@0x%p", ff->data);
- else if (ff->func == fetch_symbol) {
- struct symbol_cache *sc = ff->data;
- if (sc->offset)
- ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
- sc->offset);
- else
- ret = snprintf(buf, n, "@%s", sc->symbol);
- } else if (ff->func == fetch_retvalue)
- ret = snprintf(buf, n, "$retval");
- else if (ff->func == fetch_stack_address)
- ret = snprintf(buf, n, "$stack");
- else if (ff->func == fetch_indirect) {
- struct indirect_fetch_data *id = ff->data;
- size_t l = 0;
- ret = snprintf(buf, n, "%+ld(", id->offset);
- if (ret >= n)
- goto end;
- l += ret;
- ret = probe_arg_string(buf + l, n - l, &id->orig);
- if (ret < 0)
- goto end;
- l += ret;
- ret = snprintf(buf + l, n - l, ")");
- ret += l;
- }
-end:
- if (ret >= n)
- return -ENOSPC;
- return ret;
-}
-
static int register_probe_event(struct trace_probe *tp);
static void unregister_probe_event(struct trace_probe *tp);
@@ -347,11 +428,12 @@ error:
static void free_probe_arg(struct probe_arg *arg)
{
- if (arg->fetch.func == fetch_symbol)
+ if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
+ free_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
free_symbol_cache(arg->fetch.data);
- else if (arg->fetch.func == fetch_indirect)
- free_indirect_fetch_data(arg->fetch.data);
kfree(arg->name);
+ kfree(arg->comm);
}
static void free_trace_probe(struct trace_probe *tp)
@@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
#define PARAM_MAX_ARGS 16
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, int is_return)
{
int ret = 0;
unsigned long param;
if (strcmp(arg, "retval") == 0) {
- if (is_return) {
- ff->func = fetch_retvalue;
- ff->data = NULL;
- } else
+ if (is_return)
+ f->fn = t->retval;
+ else
ret = -EINVAL;
} else if (strncmp(arg, "stack", 5) == 0) {
if (arg[5] == '\0') {
- ff->func = fetch_stack_address;
- ff->data = NULL;
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
+ f->fn = fetch_stack_address;
+ else
+ ret = -EINVAL;
} else if (isdigit(arg[5])) {
ret = strict_strtoul(arg + 5, 10, &param);
if (ret || param > PARAM_MAX_STACK)
ret = -EINVAL;
else {
- ff->func = fetch_stack;
- ff->data = (void *)param;
+ f->fn = t->stack;
+ f->data = (void *)param;
}
} else
ret = -EINVAL;
@@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
}
/* Recursive argument parser */
-static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+static int __parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, int is_return)
{
int ret = 0;
unsigned long param;
@@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, ff, is_return);
+ ret = parse_probe_vars(arg + 1, t, f, is_return);
break;
case '%': /* named register */
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
- ff->func = fetch_register;
- ff->data = (void *)(unsigned long)ret;
+ f->fn = t->reg;
+ f->data = (void *)(unsigned long)ret;
ret = 0;
}
break;
@@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
ret = strict_strtoul(arg + 1, 0, &param);
if (ret)
break;
- ff->func = fetch_memory;
- ff->data = (void *)param;
+ f->fn = t->memory;
+ f->data = (void *)param;
} else {
ret = split_symbol_offset(arg + 1, &offset);
if (ret)
break;
- ff->data = alloc_symbol_cache(arg + 1, offset);
- if (ff->data)
- ff->func = fetch_symbol;
- else
- ret = -EINVAL;
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->symbol;
}
break;
- case '+': /* indirect memory */
+ case '+': /* deref memory */
case '-':
tmp = strchr(arg, '(');
- if (!tmp) {
- ret = -EINVAL;
+ if (!tmp)
break;
- }
*tmp = '\0';
ret = strict_strtol(arg + 1, 0, &offset);
if (ret)
@@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
arg = tmp + 1;
tmp = strrchr(arg, ')');
if (tmp) {
- struct indirect_fetch_data *id;
+ struct deref_fetch_param *dprm;
+ const struct fetch_type *t2 = find_fetch_type(NULL);
*tmp = '\0';
- id = kzalloc(sizeof(struct indirect_fetch_data),
- GFP_KERNEL);
- if (!id)
+ dprm = kzalloc(sizeof(struct deref_fetch_param),
+ GFP_KERNEL);
+ if (!dprm)
return -ENOMEM;
- id->offset = offset;
- ret = __parse_probe_arg(arg, &id->orig, is_return);
+ dprm->offset = offset;
+ ret = __parse_probe_arg(arg, t2, &dprm->orig,
+ is_return);
if (ret)
- kfree(id);
+ kfree(dprm);
else {
- ff->func = fetch_indirect;
- ff->data = (void *)id;
+ f->fn = t->deref;
+ f->data = (void *)dprm;
}
- } else
- ret = -EINVAL;
+ }
break;
- default:
- /* TODO: support custom handler */
- ret = -EINVAL;
}
+ if (!ret && !f->fn)
+ ret = -EINVAL;
return ret;
}
/* String length checking wrapper */
-static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_arg(char *arg, struct trace_probe *tp,
+ struct probe_arg *parg, int is_return)
{
+ const char *t;
+
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
return -ENOSPC;
}
- return __parse_probe_arg(arg, ff, is_return);
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ return -ENOMEM;
+ }
+ t = strchr(parg->comm, ':');
+ if (t) {
+ arg[t - parg->comm] = '\0';
+ t++;
+ }
+ parg->type = find_fetch_type(t);
+ if (!parg->type) {
+ pr_info("Unsupported type: %s\n", t);
+ return -EINVAL;
+ }
+ parg->offset = tp->size;
+ tp->size += parg->type->size;
+ return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
}
/* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv)
* @ADDR : fetch memory at ADDR (ADDR should be in kernel)
* @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
* %REG : fetch register REG
- * Indirect memory fetch:
+ * Dereferencing memory fetch:
* +|-offs(ARG) : fetch memory at ARG +|- offs address.
* Alias name of args:
* NAME=FETCHARG : set NAME as alias of FETCHARG.
+ * Type of args:
+ * FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_probe *tp;
int i, ret = 0;
int is_return = 0, is_delete = 0;
- char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+ char *symbol = NULL, *event = NULL, *group = NULL;
+ char *arg, *tmp;
unsigned long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
@@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv)
else
arg = argv[i];
- if (conflict_field_name(argv[i], tp->args, i)) {
- pr_info("Argument%d name '%s' conflicts with "
- "another field.\n", i, argv[i]);
- ret = -EINVAL;
- goto error;
- }
-
tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
if (!tp->args[i].name) {
pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv)
ret = -ENOMEM;
goto error;
}
+ tmp = strchr(tp->args[i].name, ':');
+ if (tmp)
+ *tmp = '_'; /* convert : to _ */
+
+ if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+ pr_info("Argument%d name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
/* Parse fetch argument */
- ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+ ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
if (ret) {
pr_info("Parse error at argument%d. (%d)\n", i, ret);
kfree(tp->args[i].name);
@@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v)
static int probes_seq_show(struct seq_file *m, void *v)
{
struct trace_probe *tp = v;
- int i, ret;
- char buf[MAX_ARGSTR_LEN + 1];
+ int i;
seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
@@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
else
seq_printf(m, " %s", probe_symbol(tp));
- for (i = 0; i < tp->nr_args; i++) {
- ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
- if (ret < 0) {
- pr_warning("Argument%d decoding error(%d).\n", i, ret);
- return ret;
- }
- seq_printf(m, " %s=%s", tp->args[i].name, buf);
- }
+ for (i = 0; i < tp->nr_args; i++)
+ seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
seq_printf(m, "\n");
+
return 0;
}
@@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = {
static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
- struct kprobe_trace_entry *entry;
+ struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
+ u8 *data;
int size, i, pc;
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;
@@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
local_save_flags(irq_flags);
pc = preempt_count();
- size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+ size = sizeof(*entry) + tp->size;
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
@@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
return;
entry = ring_buffer_event_data(event);
- entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
+ data = (u8 *)&entry[1];
for (i = 0; i < tp->nr_args; i++)
- entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
- struct kretprobe_trace_entry *entry;
+ struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
+ u8 *data;
int size, i, pc;
unsigned long irq_flags;
struct ftrace_event_call *call = &tp->call;
@@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
local_save_flags(irq_flags);
pc = preempt_count();
- size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+ size = sizeof(*entry) + tp->size;
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
@@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
return;
entry = ring_buffer_event_data(event);
- entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
+ data = (u8 *)&entry[1];
for (i = 0; i < tp->nr_args; i++)
- entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
enum print_line_t
print_kprobe_event(struct trace_iterator *iter, int flags)
{
- struct kprobe_trace_entry *field;
+ struct kprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
struct trace_event *event;
struct trace_probe *tp;
+ u8 *data;
int i;
- field = (struct kprobe_trace_entry *)iter->ent;
+ field = (struct kprobe_trace_entry_head *)iter->ent;
event = ftrace_find_event(field->ent.type);
tp = container_of(event, struct trace_probe, event);
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
if (!trace_seq_puts(s, ")"))
goto partial;
- for (i = 0; i < field->nargs; i++)
- if (!trace_seq_printf(s, " %s=%lx",
- tp->args[i].name, field->args[i]))
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset))
goto partial;
if (!trace_seq_puts(s, "\n"))
@@ -1046,13 +1151,14 @@ partial:
enum print_line_t
print_kretprobe_event(struct trace_iterator *iter, int flags)
{
- struct kretprobe_trace_entry *field;
+ struct kretprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
struct trace_event *event;
struct trace_probe *tp;
+ u8 *data;
int i;
- field = (struct kretprobe_trace_entry *)iter->ent;
+ field = (struct kretprobe_trace_entry_head *)iter->ent;
event = ftrace_find_event(field->ent.type);
tp = container_of(event, struct trace_probe, event);
@@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
if (!trace_seq_puts(s, ")"))
goto partial;
- for (i = 0; i < field->nargs; i++)
- if (!trace_seq_printf(s, " %s=%lx",
- tp->args[i].name, field->args[i]))
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset))
goto partial;
if (!trace_seq_puts(s, "\n"))
@@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
{
int ret, i;
- struct kprobe_trace_entry field;
+ struct kprobe_trace_entry_head field;
struct trace_probe *tp = (struct trace_probe *)event_call->data;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
- DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
/* Set argument names as fields */
- for (i = 0; i < tp->nr_args; i++)
- DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+ for (i = 0; i < tp->nr_args; i++) {
+ ret = trace_define_field(event_call, tp->args[i].type->name,
+ tp->args[i].name,
+ sizeof(field) + tp->args[i].offset,
+ tp->args[i].type->size,
+ tp->args[i].type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
return 0;
}
static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
{
int ret, i;
- struct kretprobe_trace_entry field;
+ struct kretprobe_trace_entry_head field;
struct trace_probe *tp = (struct trace_probe *)event_call->data;
DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
- DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
/* Set argument names as fields */
- for (i = 0; i < tp->nr_args; i++)
- DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+ for (i = 0; i < tp->nr_args; i++) {
+ ret = trace_define_field(event_call, tp->args[i].type->name,
+ tp->args[i].name,
+ sizeof(field) + tp->args[i].offset,
+ tp->args[i].type->size,
+ tp->args[i].type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
for (i = 0; i < tp->nr_args; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
- tp->args[i].name);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+ tp->args[i].name, tp->args[i].type->fmt);
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
- struct kprobe_trace_entry *entry;
+ struct kprobe_trace_entry_head *entry;
+ u8 *data;
int size, __size, i;
unsigned long irq_flags;
int rctx;
- __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+ __size = sizeof(*entry) + tp->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
if (!entry)
return;
- entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
+ data = (u8 *)&entry[1];
for (i = 0; i < tp->nr_args; i++)
- entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
}
@@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
- struct kretprobe_trace_entry *entry;
+ struct kretprobe_trace_entry_head *entry;
+ u8 *data;
int size, __size, i;
unsigned long irq_flags;
int rctx;
- __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+ __size = sizeof(*entry) + tp->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
if (!entry)
return;
- entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
+ data = (u8 *)&entry[1];
for (i = 0; i < tp->nr_args; i++)
- entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+ call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
irq_flags, regs);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd687947..8eaf00749b6 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
#include <asm/atomic.h>
-/*
- * For now, let us restrict the no. of symbols traced simultaneously to number
- * of available hardware breakpoint registers.
- */
-#define KSYM_TRACER_MAX HBP_NUM
-
#define KSYM_TRACER_OP_LEN 3 /* rw- */
struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
static struct trace_array *ksym_trace_array;
-static unsigned int ksym_filter_entry_count;
static unsigned int ksym_tracing_enabled;
static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
struct trace_ksym *entry;
int ret = -ENOMEM;
- if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
- printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
- " new requests for tracing can be accepted now.\n",
- KSYM_TRACER_MAX);
- return -ENOSPC;
- }
-
entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
if (!entry)
return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
if (IS_ERR(entry->ksym_hbp)) {
ret = PTR_ERR(entry->ksym_hbp);
- printk(KERN_INFO "ksym_tracer request failed. Try again"
- " later!!\n");
+ if (ret == -ENOSPC) {
+ printk(KERN_ERR "ksym_tracer: Maximum limit reached."
+ " No new requests for tracing can be accepted now.\n");
+ } else {
+ printk(KERN_INFO "ksym_tracer request failed. Try again"
+ " later!!\n");
+ }
goto err;
}
hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
- ksym_filter_entry_count++;
return 0;
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
ksym_hlist) {
unregister_wide_hw_breakpoint(entry->ksym_hbp);
- ksym_filter_entry_count--;
hlist_del_rcu(&(entry->ksym_hlist));
synchronize_rcu();
kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
goto out_unlock;
}
/* Error or "symbol:---" case: drop it */
- ksym_filter_entry_count--;
hlist_del_rcu(&(entry->ksym_hlist));
synchronize_rcu();
kfree(entry);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde..a55fccfede5 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
}
static void
-probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+probe_sched_switch(struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
}
static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
+probe_sched_wakeup(struct task_struct *wakee, int success)
{
struct trace_array_cpu *data;
unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8..8052446ceea 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
}
static void notrace
-probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
@@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(struct rq *rq, struct task_struct *p, int success)
+probe_wakeup(struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 6a9d36ddfcf..250e7f9bd2f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
- case TRACE_HW_BRANCHES:
case TRACE_KSYM:
return 1;
}
@@ -756,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_BRANCH_TRACER */
-#ifdef CONFIG_HW_BRANCH_TRACER
-int
-trace_selftest_startup_hw_branches(struct tracer *trace,
- struct trace_array *tr)
-{
- struct trace_iterator *iter;
- struct tracer tracer;
- unsigned long count;
- int ret;
-
- if (!trace->open) {
- printk(KERN_CONT "missing open function...");
- return -1;
- }
-
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /*
- * The hw-branch tracer needs to collect the trace from the various
- * cpu trace buffers - before tracing is stopped.
- */
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- memcpy(&tracer, trace, sizeof(tracer));
-
- iter->trace = &tracer;
- iter->tr = tr;
- iter->pos = -1;
- mutex_init(&iter->mutex);
-
- trace->open(iter);
-
- mutex_destroy(&iter->mutex);
- kfree(iter);
-
- tracing_stop();
-
- ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT "no entries found..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
#ifdef CONFIG_KSYM_TRACER
static int ksym_selftest_dummy;
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb..7e72614b736 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/user_namespace.h>
-#include "cred-internals.h"
struct user_namespace init_user_ns = {
.kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up, *new;
- /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
- * atomic.
- */
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- /* This case is not possible when CONFIG_USER_SCHED
- * is defined, since we serialize alloc_uid() using
- * uids_mutex. Hence no need to call
- * sched_destroy_user() or remove_user_sysfs_dir().
- */
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
return up;
- put_user_ns(new->user_ns);
- kmem_cache_free(uid_cachep, new);
out_unlock:
return NULL;
}