aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/futex.c66
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/hrtimer.c13
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/hw_breakpoint.c8
-rw-r--r--kernel/irq/Kconfig53
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/autoprobe.c15
-rw-r--r--kernel/irq/chip.c378
-rw-r--r--kernel/irq/dummychip.c68
-rw-r--r--kernel/irq/handle.c341
-rw-r--r--kernel/irq/internals.h39
-rw-r--r--kernel/irq/irqdesc.c395
-rw-r--r--kernel/irq/manage.c87
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c120
-rw-r--r--kernel/irq/proc.c26
-rw-r--r--kernel/irq/resend.c5
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq_work.c164
-rw-r--r--kernel/lockdep.c51
-rw-r--r--kernel/perf_event.c278
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/rcupdate.c8
-rw-r--r--kernel/rcutiny.c33
-rw-r--r--kernel/rcutiny_plugin.h582
-rw-r--r--kernel/rcutorture.c17
-rw-r--r--kernel/rcutree.c92
-rw-r--r--kernel/rcutree.h20
-rw-r--r--kernel/rcutree_plugin.h47
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/sched.c307
-rw-r--r--kernel/sched_fair.c81
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c40
-rw-r--r--kernel/sched_stoptask.c108
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/softirq.c73
-rw-r--r--kernel/srcu.c2
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/sysctl_check.c9
-rw-r--r--kernel/time/ntp.c14
-rw-r--r--kernel/timer.c7
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_functions_graph.c86
-rw-r--r--kernel/trace/trace_irqsoff.c152
-rw-r--r--kernel/trace/trace_sched_wakeup.c256
-rw-r--r--kernel/tracepoint.c6
-rw-r--r--kernel/watchdog.c2
57 files changed, 2854 insertions, 1287 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d52b473c99a..e2c9d52cfe9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f614..291ba3d04be 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* css_tryget() should be used for avoiding race.
*/
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state __rcu *css;
/*
* ID of this css.
*/
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe..51b143e2a07 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk, 0, NULL);
+ ret = security_task_setscheduler(tsk);
if (ret)
return ret;
if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c, 0, NULL);
+ ret = security_task_setscheduler(c);
if (ret) {
rcu_read_unlock();
return ret;
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526..a118bf160e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakup is always to make the first condition true, then
+ * The order of wakeup is always to make the first condition true, then
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
* Slow path to fixup the fault we just took in the atomic write
* access to @uaddr.
*
- * We have no generic implementation of a non destructive write to the
+ * We have no generic implementation of a non-destructive write to the
* user address. We know that we faulted in the atomic pagefault
* disabled section so we can as well avoid the #PF overhead by
* calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non PI and PI futexes
+ * Userspace might have messed up non-PI and PI futexes
*/
if (unlikely(!pi_state))
return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non futex wake up happens on another CPU then the task
- * might exit and p would dereference a non existing task
+ * a non-futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non-existing task
* struct. Prevent this by holding a reference on p across the
* wake up.
*/
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * uaddr1: source futex user address
- * uaddr2: target futex user address
- * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * nr_requeue: number of waiters to requeue (0-INT_MAX)
- * requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * @uaddr1: source futex user address
+ * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
* pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
/* The key must be already stored in q->key. */
static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ __acquires(&hb->lock)
{
struct futex_hash_bucket *hb;
- get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
spin_unlock(&hb->lock);
- drop_futex_key_refs(&q->key);
}
/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
* an example).
*/
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
int prio;
@@ -1471,6 +1475,7 @@ retry:
* and dropped here.
*/
static void unqueue_me_pi(struct futex_q *q)
+ __releases(q->lock_ptr)
{
WARN_ON(plist_node_empty(&q->list));
plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
-
- drop_futex_key_refs(&q->key);
}
/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
}
retry:
- /* Prepare to wait on uaddr. */
+ /*
+ * Prepare to wait on uaddr. On success, holds hb lock and increments
+ * q.key refs.
+ */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
goto out;
@@ -1822,28 +1828,27 @@ retry:
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
+ /* unqueue_me() drops q.key ref */
if (!unqueue_me(&q))
- goto out_put_key;
+ goto out;
ret = -ETIMEDOUT;
if (to && !to->task)
- goto out_put_key;
+ goto out;
/*
* We expect signal_pending(current), but we might be the
* victim of a spurious wakeup as well.
*/
- if (!signal_pending(current)) {
- put_futex_key(fshared, &q.key);
+ if (!signal_pending(current))
goto retry;
- }
ret = -ERESTARTSYS;
if (!abs_time)
- goto out_put_key;
+ goto out;
restart = &current_thread_info()->restart_block;
restart->fn = futex_wait_restart;
- restart->futex.uaddr = (u32 *)uaddr;
+ restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
ret = -ERESTART_RESTARTBLOCK;
-out_put_key:
- put_futex_key(fshared, &q.key);
out:
if (to) {
hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
static long futex_wait_restart(struct restart_block *restart)
{
- u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+ u32 __user *uaddr = restart->futex.uaddr;
int fshared = 0;
ktime_t t, *tp = NULL;
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
- /* Prepare to wait on uaddr. */
+ /*
+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+ * count.
+ */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* In order for us to be here, we know our q.key == key2, and since
* we took the hb->lock above, we also know that futex_requeue() has
* completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquition by the requeue code.
+ * race with the atomic proxy lock acquisition by the requeue code. The
+ * futex_requeue dropped our key1 reference and incremented our key2
+ * reference count.
*/
/* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
struct robust_list __user * __user *head,
- int *pi)
+ unsigned int *pi)
{
unsigned long uentry;
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
* of the complex code paths. Also we want to prevent
* registration of robust lists in that case. NULL is
* guaranteed to fault and we get -EFAULT on functional
- * implementation, the non functional ones will return
+ * implementation, the non-functional ones will return
* -ENOSYS.
*/
curval = cmpxchg_futex_value_locked(NULL, 0, 0);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e..06da4dfc339 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
*/
static inline int
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, int *pi)
+ compat_uptr_t __user *head, unsigned int *pi)
{
if (get_user(*uentry, head))
return -EFAULT;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1decafbb6b1..72206cf5c6c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
+ unsigned long state;
int reprogram;
/*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
- __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
- reprogram);
+ /*
+ * We must preserve the CALLBACK state flag here,
+ * otherwise we could move the timer base in
+ * switch_hrtimer_base.
+ */
+ state = timer->state & HRTIMER_STATE_CALLBACK;
+ __remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
@@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base);
}
+
+ WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac..53ead174da2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- __debug_show_held_locks(t);
+ debug_show_held_locks(t);
touch_nmi_watchdog();
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* periodically exit the critical section and enter a new one.
*
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
- * exit the grace period. For classic RCU, a reschedule is required.
+ * to exit the grace period. For classic RCU, a reschedule is required.
*/
static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 3b714e839c1..2c9120f0afc 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
*/
static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
{
- struct perf_event_context *ctx = bp->ctx;
+ struct task_struct *tsk = bp->hw.bp_target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->ctx == ctx && find_slot_idx(iter) == type)
+ if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
count += hw_breakpoint_weight(iter);
}
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
enum bp_type_idx type)
{
int cpu = bp->cpu;
- struct task_struct *tsk = bp->ctx->task;
+ struct task_struct *tsk = bp->hw.bp_target;
if (cpu >= 0) {
slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
int cpu = bp->cpu;
- struct task_struct *tsk = bp->ctx->task;
+ struct task_struct *tsk = bp->hw.bp_target;
/* Pinned counter cpu profiling */
if (!tsk) {
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 00000000000..31d766bf5d2
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
+config HAVE_GENERIC_HARDIRQS
+ def_bool n
+
+if HAVE_GENERIC_HARDIRQS
+menu "IRQ subsystem"
+#
+# Interrupt subsystem related configuration options
+#
+config GENERIC_HARDIRQS
+ def_bool y
+
+config GENERIC_HARDIRQS_NO__DO_IRQ
+ def_bool y
+
+# Select this to disable the deprecated stuff
+config GENERIC_HARDIRQS_NO_DEPRECATED
+ def_bool n
+
+# Options selectable by the architecture code
+config HAVE_SPARSE_IRQ
+ def_bool n
+
+config GENERIC_IRQ_PROBE
+ def_bool n
+
+config GENERIC_PENDING_IRQ
+ def_bool n
+
+config AUTO_IRQ_AFFINITY
+ def_bool n
+
+config IRQ_PER_CPU
+ def_bool n
+
+config HARDIRQS_SW_RESEND
+ def_bool n
+
+config SPARSE_IRQ
+ bool "Support sparse irq numbering"
+ depends on HAVE_SPARSE_IRQ
+ ---help---
+
+ Sparse irq numbering is useful for distro kernels that want
+ to define a high CONFIG_NR_CPUS value but still want to have
+ low kernel memory footprint on smaller machines.
+
+ ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
+ out the interrupt descriptors in a more NUMA-friendly way. )
+
+ If you don't know what to do here, say N.
+
+endmenu
+endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419..54329cd7b3e 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
-obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef11..505798f86c3 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
* Some chips need to know about probing in
* progress:
*/
- if (desc->chip->set_type)
- desc->chip->set_type(i, IRQ_TYPE_PROBE);
- desc->chip->startup(i);
+ if (desc->irq_data.chip->irq_set_type)
+ desc->irq_data.chip->irq_set_type(&desc->irq_data,
+ IRQ_TYPE_PROBE);
+ desc->irq_data.chip->irq_startup(&desc->irq_data);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
raw_spin_lock_irq(&desc->lock);
if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
- if (desc->chip->startup(i))
+ if (desc->irq_data.chip->irq_startup(&desc->irq_data))
desc->status |= IRQ_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
/* It triggered already - consider it spurious. */
if (!(status & IRQ_WAITING)) {
desc->status = status & ~IRQ_AUTODETECT;
- desc->chip->shutdown(i);
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
} else
if (i < 32)
mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
mask |= 1 << i;
desc->status = status & ~IRQ_AUTODETECT;
- desc->chip->shutdown(i);
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
nr_of_irqs++;
}
desc->status = status & ~IRQ_AUTODETECT;
- desc->chip->shutdown(i);
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f..baa5c4acad8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
#include "internals.h"
-static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
-{
- struct irq_desc *desc;
- unsigned long flags;
-
- desc = irq_to_desc(irq);
- if (!desc) {
- WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
- return;
- }
-
- /* Ensure we don't have left over values from a previous use of this irq */
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->status = IRQ_DISABLED;
- desc->chip = &no_irq_chip;
- desc->handle_irq = handle_bad_irq;
- desc->depth = 1;
- desc->msi_desc = NULL;
- desc->handler_data = NULL;
- if (!keep_chip_data)
- desc->chip_data = NULL;
- desc->action = NULL;
- desc->irq_count = 0;
- desc->irqs_unhandled = 0;
-#ifdef CONFIG_SMP
- cpumask_setall(desc->affinity);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- cpumask_clear(desc->pending_mask);
-#endif
-#endif
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-/**
- * dynamic_irq_init - initialize a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
-{
- dynamic_irq_init_x(irq, false);
-}
-
-/**
- * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
- * @irq: irq number to initialize
- *
- * does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_init_keep_chip_data(unsigned int irq)
-{
- dynamic_irq_init_x(irq, true);
-}
-
-static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- if (!desc) {
- WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
- return;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (desc->action) {
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
- irq);
- return;
- }
- desc->msi_desc = NULL;
- desc->handler_data = NULL;
- if (!keep_chip_data)
- desc->chip_data = NULL;
- desc->handle_irq = handle_bad_irq;
- desc->chip = &no_irq_chip;
- desc->name = NULL;
- clear_kstat_irqs(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-/**
- * dynamic_irq_cleanup - cleanup a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_cleanup(unsigned int irq)
-{
- dynamic_irq_cleanup_x(irq, false);
-}
-
-/**
- * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
- * @irq: irq number to initialize
- *
- * does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
-{
- dynamic_irq_cleanup_x(irq, true);
-}
-
-
/**
* set_irq_chip - set the irq chip for an irq
* @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
raw_spin_lock_irqsave(&desc->lock, flags);
irq_chip_set_defaults(chip);
- desc->chip = chip;
+ desc->irq_data.chip = chip;
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
}
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->handler_data = data;
+ desc->irq_data.handler_data = data;
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
}
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->msi_desc = entry;
+ desc->irq_data.msi_desc = entry;
if (entry)
entry->irq = irq;
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
return -EINVAL;
}
- if (!desc->chip) {
+ if (!desc->irq_data.chip) {
printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
return -EINVAL;
}
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->chip_data = data;
+ desc->irq_data.chip_data = data;
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(set_irq_chip_data);
+struct irq_data *irq_get_irq_data(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc ? &desc->irq_data : NULL;
+}
+EXPORT_SYMBOL_GPL(irq_get_irq_data);
+
/**
* set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
*
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
/*
* default enable function
*/
-static void default_enable(unsigned int irq)
+static void default_enable(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_data_to_desc(data);
- desc->chip->unmask(irq);
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
desc->status &= ~IRQ_MASKED;
}
/*
* default disable function
*/
-static void default_disable(unsigned int irq)
+static void default_disable(struct irq_data *data)
{
}
/*
* default startup function
*/
-static unsigned int default_startup(unsigned int irq)
+static unsigned int default_startup(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_data_to_desc(data);
- desc->chip->enable(irq);
+ desc->irq_data.chip->irq_enable(data);
return 0;
}
/*
* default shutdown function
*/
-static void default_shutdown(unsigned int irq)
+static void default_shutdown(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_data_to_desc(data);
- desc->chip->mask(irq);
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
desc->status |= IRQ_MASKED;
}
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+/* Temporary migration helpers */
+static void compat_irq_mask(struct irq_data *data)
+{
+ data->chip->mask(data->irq);
+}
+
+static void compat_irq_unmask(struct irq_data *data)
+{
+ data->chip->unmask(data->irq);
+}
+
+static void compat_irq_ack(struct irq_data *data)
+{
+ data->chip->ack(data->irq);
+}
+
+static void compat_irq_mask_ack(struct irq_data *data)
+{
+ data->chip->mask_ack(data->irq);
+}
+
+static void compat_irq_eoi(struct irq_data *data)
+{
+ data->chip->eoi(data->irq);
+}
+
+static void compat_irq_enable(struct irq_data *data)
+{
+ data->chip->enable(data->irq);
+}
+
+static void compat_irq_disable(struct irq_data *data)
+{
+ data->chip->disable(data->irq);
+}
+
+static void compat_irq_shutdown(struct irq_data *data)
+{
+ data->chip->shutdown(data->irq);
+}
+
+static unsigned int compat_irq_startup(struct irq_data *data)
+{
+ return data->chip->startup(data->irq);
+}
+
+static int compat_irq_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ return data->chip->set_affinity(data->irq, dest);
+}
+
+static int compat_irq_set_type(struct irq_data *data, unsigned int type)
+{
+ return data->chip->set_type(data->irq, type);
+}
+
+static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+ return data->chip->set_wake(data->irq, on);
+}
+
+static int compat_irq_retrigger(struct irq_data *data)
+{
+ return data->chip->retrigger(data->irq);
+}
+
+static void compat_bus_lock(struct irq_data *data)
+{
+ data->chip->bus_lock(data->irq);
+}
+
+static void compat_bus_sync_unlock(struct irq_data *data)
+{
+ data->chip->bus_sync_unlock(data->irq);
+}
+#endif
+
/*
* Fixup enable/disable function pointers
*/
void irq_chip_set_defaults(struct irq_chip *chip)
{
- if (!chip->enable)
- chip->enable = default_enable;
- if (!chip->disable)
- chip->disable = default_disable;
- if (!chip->startup)
- chip->startup = default_startup;
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
/*
- * We use chip->disable, when the user provided its own. When
- * we have default_disable set for chip->disable, then we need
+ * Compat fixup functions need to be before we set the
+ * defaults for enable/disable/startup/shutdown
+ */
+ if (chip->enable)
+ chip->irq_enable = compat_irq_enable;
+ if (chip->disable)
+ chip->irq_disable = compat_irq_disable;
+ if (chip->shutdown)
+ chip->irq_shutdown = compat_irq_shutdown;
+ if (chip->startup)
+ chip->irq_startup = compat_irq_startup;
+#endif
+ /*
+ * The real defaults
+ */
+ if (!chip->irq_enable)
+ chip->irq_enable = default_enable;
+ if (!chip->irq_disable)
+ chip->irq_disable = default_disable;
+ if (!chip->irq_startup)
+ chip->irq_startup = default_startup;
+ /*
+ * We use chip->irq_disable, when the user provided its own. When
+ * we have default_disable set for chip->irq_disable, then we need
* to use default_shutdown, otherwise the irq line is not
* disabled on free_irq():
*/
- if (!chip->shutdown)
- chip->shutdown = chip->disable != default_disable ?
- chip->disable : default_shutdown;
- if (!chip->name)
- chip->name = chip->typename;
+ if (!chip->irq_shutdown)
+ chip->irq_shutdown = chip->irq_disable != default_disable ?
+ chip->irq_disable : default_shutdown;
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
if (!chip->end)
chip->end = dummy_irq_chip.end;
+
+ /*
+ * Now fix up the remaining compat handlers
+ */
+ if (chip->bus_lock)
+ chip->irq_bus_lock = compat_bus_lock;
+ if (chip->bus_sync_unlock)
+ chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
+ if (chip->mask)
+ chip->irq_mask = compat_irq_mask;
+ if (chip->unmask)
+ chip->irq_unmask = compat_irq_unmask;
+ if (chip->ack)
+ chip->irq_ack = compat_irq_ack;
+ if (chip->mask_ack)
+ chip->irq_mask_ack = compat_irq_mask_ack;
+ if (chip->eoi)
+ chip->irq_eoi = compat_irq_eoi;
+ if (chip->set_affinity)
+ chip->irq_set_affinity = compat_irq_set_affinity;
+ if (chip->set_type)
+ chip->irq_set_type = compat_irq_set_type;
+ if (chip->set_wake)
+ chip->irq_set_wake = compat_irq_set_wake;
+ if (chip->retrigger)
+ chip->irq_retrigger = compat_irq_retrigger;
+#endif
}
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc *desc)
{
- if (desc->chip->mask_ack)
- desc->chip->mask_ack(irq);
+ if (desc->irq_data.chip->irq_mask_ack)
+ desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
else {
- desc->chip->mask(irq);
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ if (desc->irq_data.chip->irq_ack)
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
}
desc->status |= IRQ_MASKED;
}
-static inline void mask_irq(struct irq_desc *desc, int irq)
+static inline void mask_irq(struct irq_desc *desc)
{
- if (desc->chip->mask) {
- desc->chip->mask(irq);
+ if (desc->irq_data.chip->irq_mask) {
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
desc->status |= IRQ_MASKED;
}
}
-static inline void unmask_irq(struct irq_desc *desc, int irq)
+static inline void unmask_irq(struct irq_desc *desc)
{
- if (desc->chip->unmask) {
- desc->chip->unmask(irq);
+ if (desc->irq_data.chip->irq_unmask) {
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
desc->status &= ~IRQ_MASKED;
}
}
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
irqreturn_t action_ret;
raw_spin_lock(&desc->lock);
- mask_ack_irq(desc, irq);
+ mask_ack_irq(desc);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
desc->status &= ~IRQ_INPROGRESS;
if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
- unmask_irq(desc, irq);
+ unmask_irq(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
desc->status |= IRQ_PENDING;
- mask_irq(desc, irq);
+ mask_irq(desc);
goto out;
}
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
out:
- desc->chip->eoi(irq);
+ desc->irq_data.chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
!desc->action)) {
desc->status |= (IRQ_PENDING | IRQ_MASKED);
- mask_ack_irq(desc, irq);
+ mask_ack_irq(desc);
goto out_unlock;
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
/* Mark the IRQ currently in progress.*/
desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
irqreturn_t action_ret;
if (unlikely(!action)) {
- mask_irq(desc, irq);
+ mask_irq(desc);
goto out_unlock;
}
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
if (unlikely((desc->status &
(IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
(IRQ_PENDING | IRQ_MASKED))) {
- unmask_irq(desc, irq);
+ unmask_irq(desc);
}
desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
kstat_incr_irqs_this_cpu(irq, desc);
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ if (desc->irq_data.chip->irq_ack)
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
- if (desc->chip->eoi)
- desc->chip->eoi(irq);
+ if (desc->irq_data.chip->irq_eoi)
+ desc->irq_data.chip->irq_eoi(&desc->irq_data);
}
void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
if (!handle)
handle = handle_bad_irq;
- else if (desc->chip == &no_irq_chip) {
+ else if (desc->irq_data.chip == &no_irq_chip) {
printk(KERN_WARNING "Trying to install %sinterrupt handler "
"for IRQ%d\n", is_chained ? "chained " : "", irq);
/*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
* prevent us to setup the interrupt at all. Switch it to
* dummy_irq_chip for easy transition.
*/
- desc->chip = &dummy_irq_chip;
+ desc->irq_data.chip = &dummy_irq_chip;
}
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
/* Uninstall? */
if (handle == handle_bad_irq) {
- if (desc->chip != &no_irq_chip)
- mask_ack_irq(desc, irq);
+ if (desc->irq_data.chip != &no_irq_chip)
+ mask_ack_irq(desc);
desc->status |= IRQ_DISABLED;
desc->depth = 1;
}
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->status &= ~IRQ_DISABLED;
desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
desc->depth = 0;
- desc->chip->startup(irq);
+ desc->irq_data.chip->irq_startup(&desc->irq_data);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
__set_irq_handler(irq, handle, 0, name);
}
-void set_irq_noprobe(unsigned int irq)
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- if (!desc) {
- printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
+ if (!desc)
return;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->status |= IRQ_NOPROBE;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-void set_irq_probe(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- if (!desc) {
- printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
- return;
- }
+ /* Sanitize flags */
+ set &= IRQF_MODIFY_MASK;
+ clr &= IRQF_MODIFY_MASK;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->status &= ~IRQ_NOPROBE;
+ desc->status &= ~clr;
+ desc->status |= set;
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 00000000000..20dc5474947
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the dummy interrupt chip implementation
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+
+#include "internals.h"
+
+/*
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this themself.
+ */
+static void ack_bad(struct irq_data *data)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ print_irq_desc(data->irq, desc);
+ ack_bad_irq(data->irq);
+}
+
+/*
+ * NOP functions
+ */
+static void noop(struct irq_data *data) { }
+
+static unsigned int noop_ret(struct irq_data *data)
+{
+ return 0;
+}
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+static void compat_noop(unsigned int irq) { }
+#define END_INIT .end = compat_noop
+#else
+#define END_INIT
+#endif
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+ .name = "none",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = ack_bad,
+ END_INIT
+};
+
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+ .name = "dummy",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ END_INIT
+};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c691122..e2347eb6330 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
*/
#include <linux/irq.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/module.h>
#include <linux/random.h>
+#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/rculist.h>
-#include <linux/hash.h>
-#include <linux/radix-tree.h>
+
#include <trace/events/irq.h>
#include "internals.h"
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-struct lock_class_key irq_desc_lock_class;
-
/**
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
ack_bad_irq(irq);
}
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-static void __init init_irq_default_affinity(void)
-{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
-}
-#else
-static void __init init_irq_default_affinity(void)
-{
-}
-#endif
-
-/*
- * Linux has a controller-independent interrupt architecture.
- * Every controller has a 'controller-template', that is used
- * by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the appropriate
- * controller. Thus drivers need not be aware of the
- * interrupt-controller.
- *
- * The code is designed to be easily extended with new/different
- * interrupt controllers, without having to do assembly magic or
- * having to touch the generic code.
- *
- * Controller mappings for all interrupt sources:
- */
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
-
-#ifdef CONFIG_SPARSE_IRQ
-
-static struct irq_desc irq_desc_init = {
- .irq = -1,
- .status = IRQ_DISABLED,
- .chip = &no_irq_chip,
- .handle_irq = handle_bad_irq,
- .depth = 1,
- .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-};
-
-void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
-{
- void *ptr;
-
- ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
- GFP_ATOMIC, node);
-
- /*
- * don't overwite if can not get new one
- * init_copy_kstat_irqs() could still use old one
- */
- if (ptr) {
- printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
- desc->kstat_irqs = ptr;
- }
-}
-
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
-{
- memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
-
- raw_spin_lock_init(&desc->lock);
- desc->irq = irq;
-#ifdef CONFIG_SMP
- desc->node = node;
-#endif
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_kstat_irqs(desc, node, nr_cpu_ids);
- if (!desc->kstat_irqs) {
- printk(KERN_ERR "can not alloc kstat_irqs\n");
- BUG_ON(1);
- }
- if (!alloc_desc_masks(desc, node, false)) {
- printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
- BUG_ON(1);
- }
- init_desc_masks(desc);
- arch_init_chip_data(desc, node);
-}
-
-/*
- * Protect the sparse_irqs:
- */
-DEFINE_RAW_SPINLOCK(sparse_irq_lock);
-
-static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
-
-static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
- radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
- return radix_tree_lookup(&irq_desc_tree, irq);
-}
-
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
- void **ptr;
-
- ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
- if (ptr)
- radix_tree_replace_slot(ptr, desc);
-}
-
-static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
- [0 ... NR_IRQS_LEGACY-1] = {
- .irq = -1,
- .status = IRQ_DISABLED,
- .chip = &no_irq_chip,
- .handle_irq = handle_bad_irq,
- .depth = 1,
- .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
- }
-};
-
-static unsigned int *kstat_irqs_legacy;
-
-int __init early_irq_init(void)
-{
- struct irq_desc *desc;
- int legacy_count;
- int node;
- int i;
-
- init_irq_default_affinity();
-
- /* initialize nr_irqs based on nr_cpu_ids */
- arch_probe_nr_irqs();
- printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
-
- desc = irq_desc_legacy;
- legacy_count = ARRAY_SIZE(irq_desc_legacy);
- node = first_online_node;
-
- /* allocate based on nr_cpu_ids */
- kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
- sizeof(int), GFP_NOWAIT, node);
-
- for (i = 0; i < legacy_count; i++) {
- desc[i].irq = i;
-#ifdef CONFIG_SMP
- desc[i].node = node;
-#endif
- desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
- lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- alloc_desc_masks(&desc[i], node, true);
- init_desc_masks(&desc[i]);
- set_irq_desc(i, &desc[i]);
- }
-
- return arch_early_irq_init();
-}
-
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
-{
- struct irq_desc *desc;
- unsigned long flags;
-
- if (irq >= nr_irqs) {
- WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
- irq, nr_irqs);
- return NULL;
- }
-
- desc = irq_to_desc(irq);
- if (desc)
- return desc;
-
- raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-
- /* We have to check it to avoid races with another CPU */
- desc = irq_to_desc(irq);
- if (desc)
- goto out_unlock;
-
- desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-
- printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
- if (!desc) {
- printk(KERN_ERR "can not alloc irq_desc\n");
- BUG_ON(1);
- }
- init_one_irq_desc(irq, desc, node);
-
- set_irq_desc(irq, desc);
-
-out_unlock:
- raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
- return desc;
-}
-
-#else /* !CONFIG_SPARSE_IRQ */
-
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
- [0 ... NR_IRQS-1] = {
- .status = IRQ_DISABLED,
- .chip = &no_irq_chip,
- .handle_irq = handle_bad_irq,
- .depth = 1,
- .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
- }
-};
-
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
-int __init early_irq_init(void)
-{
- struct irq_desc *desc;
- int count;
- int i;
-
- init_irq_default_affinity();
-
- printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
-
- desc = irq_desc;
- count = ARRAY_SIZE(irq_desc);
-
- for (i = 0; i < count; i++) {
- desc[i].irq = i;
- alloc_desc_masks(&desc[i], 0, true);
- init_desc_masks(&desc[i]);
- desc[i].kstat_irqs = kstat_irqs_all[i];
- }
- return arch_early_irq_init();
-}
-
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
- return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-}
-
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
- return irq_to_desc(irq);
-}
-#endif /* !CONFIG_SPARSE_IRQ */
-
-void clear_kstat_irqs(struct irq_desc *desc)
-{
- memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
-}
-
-/*
- * What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
- */
-static void ack_bad(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- print_irq_desc(irq, desc);
- ack_bad_irq(irq);
-}
-
-/*
- * NOP functions
- */
-static void noop(unsigned int irq)
-{
-}
-
-static unsigned int noop_ret(unsigned int irq)
-{
- return 0;
-}
-
-/*
- * Generic no controller implementation
- */
-struct irq_chip no_irq_chip = {
- .name = "none",
- .startup = noop_ret,
- .shutdown = noop,
- .enable = noop,
- .disable = noop,
- .ack = ack_bad,
- .end = noop,
-};
-
-/*
- * Generic dummy implementation which can be used for
- * real dumb interrupt sources
- */
-struct irq_chip dummy_irq_chip = {
- .name = "dummy",
- .startup = noop_ret,
- .shutdown = noop,
- .enable = noop,
- .disable = noop,
- .ack = noop,
- .mask = noop,
- .unmask = noop,
- .end = noop,
-};
-
/*
* Special, empty irq handler:
*/
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
/*
* No locking required for CPU-local interrupts:
*/
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ if (desc->irq_data.chip->ack)
+ desc->irq_data.chip->ack(irq);
if (likely(!(desc->status & IRQ_DISABLED))) {
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
}
- desc->chip->end(irq);
+ desc->irq_data.chip->end(irq);
return 1;
}
raw_spin_lock(&desc->lock);
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ if (desc->irq_data.chip->ack)
+ desc->irq_data.chip->ack(irq);
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
* The ->end() handler has to deal with interrupts which got
* disabled while the handler was running.
*/
- desc->chip->end(irq);
+ desc->irq_data.chip->end(irq);
raw_spin_unlock(&desc->lock);
return 1;
}
#endif
-
-void early_init_irq_lock_class(void)
-{
- struct irq_desc *desc;
- int i;
-
- for_each_irq_desc(i, desc) {
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- }
-}
-
-unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- return desc ? desc->kstat_irqs[cpu] : 0;
-}
-EXPORT_SYMBOL(kstat_irqs_cpu);
-
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0..4571ae7e085 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
/*
* IRQ subsystem internal functions and variables:
*/
+#include <linux/irqdesc.h>
extern int noirqdebug;
+#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+
/* Set default functions for irq_chip structures: */
extern void irq_chip_set_defaults(struct irq_chip *chip);
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
-extern struct lock_class_key irq_desc_lock_class;
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-extern void clear_kstat_irqs(struct irq_desc *desc);
-extern raw_spinlock_t sparse_irq_lock;
-#ifdef CONFIG_SPARSE_IRQ
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
-#endif
+/* Resending of interrupts :*/
+void check_irq_resend(struct irq_desc *desc, unsigned int irq);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+static inline void irq_end(unsigned int irq, struct irq_desc *desc)
+{
+ if (desc->irq_data.chip && desc->irq_data.chip->end)
+ desc->irq_data.chip->end(irq);
+}
+#else
+static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
+#endif
+
/* Inline functions for support of irq chips on slow busses */
-static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_lock(struct irq_desc *desc)
{
- if (unlikely(desc->chip->bus_lock))
- desc->chip->bus_lock(irq);
+ if (unlikely(desc->irq_data.chip->irq_bus_lock))
+ desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
}
-static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_sync_unlock(struct irq_desc *desc)
{
- if (unlikely(desc->chip->bus_sync_unlock))
- desc->chip->bus_sync_unlock(irq);
+ if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
+ desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}
/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
printk("->handle_irq(): %p, ", desc->handle_irq);
print_symbol("%s\n", (unsigned long)desc->handle_irq);
- printk("->chip(): %p, ", desc->chip);
- print_symbol("%s\n", (unsigned long)desc->chip);
+ printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+ print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
printk("->action(): %p\n", desc->action);
if (desc->action) {
printk("->action->handler(): %p, ", desc->action->handler);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 00000000000..9d917ff7267
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the interrupt descriptor management code
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/radix-tree.h>
+#include <linux/bitmap.h>
+
+#include "internals.h"
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+{
+ if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+ return -ENOMEM;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
+ free_cpumask_var(desc->irq_data.affinity);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+static void desc_smp_init(struct irq_desc *desc, int node)
+{
+ desc->irq_data.node = node;
+ cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_clear(desc->pending_mask);
+#endif
+}
+
+static inline int desc_node(struct irq_desc *desc)
+{
+ return desc->irq_data.node;
+}
+
+#else
+static inline int
+alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline int desc_node(struct irq_desc *desc) { return 0; }
+#endif
+
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
+{
+ desc->irq_data.irq = irq;
+ desc->irq_data.chip = &no_irq_chip;
+ desc->irq_data.chip_data = NULL;
+ desc->irq_data.handler_data = NULL;
+ desc->irq_data.msi_desc = NULL;
+ desc->status = IRQ_DEFAULT_INIT_FLAGS;
+ desc->handle_irq = handle_bad_irq;
+ desc->depth = 1;
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+ desc->name = NULL;
+ memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+ desc_smp_init(desc, node);
+}
+
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+
+static DEFINE_MUTEX(sparse_irq_lock);
+static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+
+#ifdef CONFIG_SPARSE_IRQ
+
+static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ radix_tree_insert(&irq_desc_tree, irq, desc);
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return radix_tree_lookup(&irq_desc_tree, irq);
+}
+
+static void delete_irq_desc(unsigned int irq)
+{
+ radix_tree_delete(&irq_desc_tree, irq);
+}
+
+#ifdef CONFIG_SMP
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->irq_data.affinity);
+}
+#else
+static inline void free_masks(struct irq_desc *desc) { }
+#endif
+
+static struct irq_desc *alloc_desc(int irq, int node)
+{
+ struct irq_desc *desc;
+ gfp_t gfp = GFP_KERNEL;
+
+ desc = kzalloc_node(sizeof(*desc), gfp, node);
+ if (!desc)
+ return NULL;
+ /* allocate based on nr_cpu_ids */
+ desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
+ gfp, node);
+ if (!desc->kstat_irqs)
+ goto err_desc;
+
+ if (alloc_masks(desc, gfp, node))
+ goto err_kstat;
+
+ raw_spin_lock_init(&desc->lock);
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+
+ desc_set_defaults(irq, desc, node);
+
+ return desc;
+
+err_kstat:
+ kfree(desc->kstat_irqs);
+err_desc:
+ kfree(desc);
+ return NULL;
+}
+
+static void free_desc(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unregister_irq_proc(irq, desc);
+
+ mutex_lock(&sparse_irq_lock);
+ delete_irq_desc(irq);
+ mutex_unlock(&sparse_irq_lock);
+
+ free_masks(desc);
+ kfree(desc->kstat_irqs);
+ kfree(desc);
+}
+
+static int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ desc = alloc_desc(start + i, node);
+ if (!desc)
+ goto err;
+ mutex_lock(&sparse_irq_lock);
+ irq_insert_desc(start + i, desc);
+ mutex_unlock(&sparse_irq_lock);
+ }
+ return start;
+
+err:
+ for (i--; i >= 0; i--)
+ free_desc(start + i);
+
+ mutex_lock(&sparse_irq_lock);
+ bitmap_clear(allocated_irqs, start, cnt);
+ mutex_unlock(&sparse_irq_lock);
+ return -ENOMEM;
+}
+
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+ int res = irq_alloc_descs(irq, irq, 1, node);
+
+ if (res == -EEXIST || res == irq)
+ return irq_to_desc(irq);
+ return NULL;
+}
+
+int __init early_irq_init(void)
+{
+ int i, initcnt, node = first_online_node;
+ struct irq_desc *desc;
+
+ init_irq_default_affinity();
+
+ /* Let arch update nr_irqs and return the nr of preallocated irqs */
+ initcnt = arch_probe_nr_irqs();
+ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+
+ for (i = 0; i < initcnt; i++) {
+ desc = alloc_desc(i, node);
+ set_bit(i, allocated_irqs);
+ irq_insert_desc(i, desc);
+ }
+ return arch_early_irq_init();
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
+ [0 ... NR_IRQS-1] = {
+ .status = IRQ_DEFAULT_INIT_FLAGS,
+ .handle_irq = handle_bad_irq,
+ .depth = 1,
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
+ }
+};
+
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
+int __init early_irq_init(void)
+{
+ int count, i, node = first_online_node;
+ struct irq_desc *desc;
+
+ init_irq_default_affinity();
+
+ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
+ desc = irq_desc;
+ count = ARRAY_SIZE(irq_desc);
+
+ for (i = 0; i < count; i++) {
+ desc[i].irq_data.irq = i;
+ desc[i].irq_data.chip = &no_irq_chip;
+ desc[i].kstat_irqs = kstat_irqs_all[i];
+ alloc_masks(desc + i, GFP_KERNEL, node);
+ desc_smp_init(desc + i, node);
+ lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ }
+ return arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+ return irq_to_desc(irq);
+}
+
+static void free_desc(unsigned int irq)
+{
+ dynamic_irq_cleanup(irq);
+}
+
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+ return start;
+}
+#endif /* !CONFIG_SPARSE_IRQ */
+
+/* Dynamic interrupt handling */
+
+/**
+ * irq_free_descs - free irq descriptors
+ * @from: Start of descriptor range
+ * @cnt: Number of consecutive irqs to free
+ */
+void irq_free_descs(unsigned int from, unsigned int cnt)
+{
+ int i;
+
+ if (from >= nr_irqs || (from + cnt) > nr_irqs)
+ return;
+
+ for (i = 0; i < cnt; i++)
+ free_desc(from + i);
+
+ mutex_lock(&sparse_irq_lock);
+ bitmap_clear(allocated_irqs, from, cnt);
+ mutex_unlock(&sparse_irq_lock);
+}
+
+/**
+ * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * @irq: Allocate for specific irq number if irq >= 0
+ * @from: Start the search from this irq number
+ * @cnt: Number of consecutive irqs to allocate.
+ * @node: Preferred node on which the irq descriptor should be allocated
+ *
+ * Returns the first irq number or error code
+ */
+int __ref
+irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
+{
+ int start, ret;
+
+ if (!cnt)
+ return -EINVAL;
+
+ mutex_lock(&sparse_irq_lock);
+
+ start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+ ret = -EEXIST;
+ if (irq >=0 && start != irq)
+ goto err;
+
+ ret = -ENOMEM;
+ if (start >= nr_irqs)
+ goto err;
+
+ bitmap_set(allocated_irqs, start, cnt);
+ mutex_unlock(&sparse_irq_lock);
+ return alloc_descs(start, cnt, node);
+
+err:
+ mutex_unlock(&sparse_irq_lock);
+ return ret;
+}
+
+/**
+ * irq_reserve_irqs - mark irqs allocated
+ * @from: mark from irq number
+ * @cnt: number of irqs to mark
+ *
+ * Returns 0 on success or an appropriate error code
+ */
+int irq_reserve_irqs(unsigned int from, unsigned int cnt)
+{
+ unsigned int start;
+ int ret = 0;
+
+ if (!cnt || (from + cnt) > nr_irqs)
+ return -EINVAL;
+
+ mutex_lock(&sparse_irq_lock);
+ start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+ if (start == from)
+ bitmap_set(allocated_irqs, start, cnt);
+ else
+ ret = -EEXIST;
+ mutex_unlock(&sparse_irq_lock);
+ return ret;
+}
+
+/**
+ * irq_get_next_irq - get next allocated irq number
+ * @offset: where to start the search
+ *
+ * Returns next irq number after offset or nr_irqs if none is found.
+ */
+unsigned int irq_get_next_irq(unsigned int offset)
+{
+ return find_next_bit(allocated_irqs, nr_irqs, offset);
+}
+
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc_set_defaults(irq, desc, desc_node(desc));
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ return desc ? desc->kstat_irqs[cpu] : 0;
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a..644e8d5fa36 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
- !desc->chip->set_affinity)
+ if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
+ !desc->irq_data.chip->irq_set_affinity)
return 0;
return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip = desc->irq_data.chip;
unsigned long flags;
- if (!desc->chip->set_affinity)
+ if (!chip->irq_set_affinity)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT) {
- if (!desc->chip->set_affinity(irq, cpumask)) {
- cpumask_copy(desc->affinity, cpumask);
+ if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
+ cpumask_copy(desc->irq_data.affinity, cpumask);
irq_set_thread_affinity(desc);
}
}
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
cpumask_copy(desc->pending_mask, cpumask);
}
#else
- if (!desc->chip->set_affinity(irq, cpumask)) {
- cpumask_copy(desc->affinity, cpumask);
+ if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
+ cpumask_copy(desc->irq_data.affinity, cpumask);
irq_set_thread_affinity(desc);
}
#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(desc->affinity, cpu_online_mask)
+ if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
< nr_cpu_ids)
goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+ cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
set_affinity:
- desc->chip->set_affinity(irq, desc->affinity);
+ desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
return 0;
}
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
if (!desc->depth++) {
desc->status |= IRQ_DISABLED;
- desc->chip->disable(irq);
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
}
}
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
if (!desc)
return;
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
__disable_irq(desc, irq, false);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
* IRQ line is re-enabled.
*
* This function may be called from IRQ context only when
- * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
if (!desc)
return;
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL(enable_irq);
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
struct irq_desc *desc = irq_to_desc(irq);
int ret = -ENXIO;
- if (desc->chip->set_wake)
- ret = desc->chip->set_wake(irq, on);
+ if (desc->irq_data.chip->irq_set_wake)
+ ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
return ret;
}
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
}
int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags)
+ unsigned long flags)
{
int ret;
- struct irq_chip *chip = desc->chip;
+ struct irq_chip *chip = desc->irq_data.chip;
- if (!chip || !chip->set_type) {
+ if (!chip || !chip->irq_set_type) {
/*
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
}
/* caller masked out all except trigger mode flags */
- ret = chip->set_type(irq, flags);
+ ret = chip->irq_set_type(&desc->irq_data, flags);
if (ret)
- pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
- (int)flags, irq, chip->set_type);
+ pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+ flags, irq, chip->irq_set_type);
else {
if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
desc->status |= flags;
- if (chip != desc->chip)
- irq_chip_set_defaults(desc->chip);
+ if (chip != desc->irq_data.chip)
+ irq_chip_set_defaults(desc->irq_data.chip);
}
return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
{
again:
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
raw_spin_lock_irq(&desc->lock);
/*
@@ -521,17 +522,17 @@ again:
*/
if (unlikely(desc->status & IRQ_INPROGRESS)) {
raw_spin_unlock_irq(&desc->lock);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
cpu_relax();
goto again;
}
if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
desc->status &= ~IRQ_MASKED;
- desc->chip->unmask(irq);
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
}
raw_spin_unlock_irq(&desc->lock);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
}
#ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
}
raw_spin_lock_irq(&desc->lock);
- cpumask_copy(mask, desc->affinity);
+ cpumask_copy(mask, desc->irq_data.affinity);
raw_spin_unlock_irq(&desc->lock);
set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!desc)
return -EINVAL;
- if (desc->chip == &no_irq_chip)
+ if (desc->irq_data.chip == &no_irq_chip)
return -ENOSYS;
/*
* Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- irq_chip_set_defaults(desc->chip);
+ irq_chip_set_defaults(desc->irq_data.chip);
init_waitqueue_head(&desc->wait_for_threads);
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
desc->status &= ~IRQ_DISABLED;
- desc->chip->startup(irq);
+ desc->irq_data.chip->irq_startup(&desc->irq_data);
} else
/* Undo nested disables: */
desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Currently used only by UML, might disappear one day: */
#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->chip->release)
- desc->chip->release(irq, dev_id);
+ if (desc->irq_data.chip->release)
+ desc->irq_data.chip->release(irq, dev_id);
#endif
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
desc->status |= IRQ_DISABLED;
- if (desc->chip->shutdown)
- desc->chip->shutdown(irq);
+ if (desc->irq_data.chip->irq_shutdown)
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
else
- desc->chip->disable(irq);
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
}
#ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
if (!desc)
return;
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
kfree(__free_irq(irq, dev_id));
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL(free_irq);
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
- chip_bus_lock(irq, desc);
+ chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(irq, desc);
+ chip_bus_sync_unlock(desc);
if (retval)
kfree(action);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 24196228083..1d254194048 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip = desc->irq_data.chip;
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
- if (!desc->chip->set_affinity)
+ if (!chip->irq_set_affinity)
return;
assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
*/
if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
< nr_cpu_ids))
- if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
- cpumask_copy(desc->affinity, desc->pending_mask);
+ if (!chip->irq_set_affinity(&desc->irq_data,
+ desc->pending_mask, false)) {
+ cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
irq_set_thread_affinity(desc);
}
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
if (unlikely(desc->status & IRQ_DISABLED))
return;
- desc->chip->mask(irq);
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
move_masked_irq(irq);
- desc->chip->unmask(irq);
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
}
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665a..00000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * NUMA irq-desc migration code
- *
- * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
- * the new "home node" of the IRQ.
- */
-
-#include <linux/irq.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-
-#include "internals.h"
-
-static void init_copy_kstat_irqs(struct irq_desc *old_desc,
- struct irq_desc *desc,
- int node, int nr)
-{
- init_kstat_irqs(desc, node, nr);
-
- if (desc->kstat_irqs != old_desc->kstat_irqs)
- memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
- nr * sizeof(*desc->kstat_irqs));
-}
-
-static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
-{
- if (old_desc->kstat_irqs == desc->kstat_irqs)
- return;
-
- kfree(old_desc->kstat_irqs);
- old_desc->kstat_irqs = NULL;
-}
-
-static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
-{
- memcpy(desc, old_desc, sizeof(struct irq_desc));
- if (!alloc_desc_masks(desc, node, false)) {
- printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
- "for migration.\n", irq);
- return false;
- }
- raw_spin_lock_init(&desc->lock);
- desc->node = node;
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
- init_copy_desc_masks(old_desc, desc);
- arch_init_copy_chip_data(old_desc, desc, node);
- return true;
-}
-
-static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
-{
- free_kstat_irqs(old_desc, desc);
- free_desc_masks(old_desc, desc);
- arch_free_chip_data(old_desc, desc);
-}
-
-static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
- int node)
-{
- struct irq_desc *desc;
- unsigned int irq;
- unsigned long flags;
-
- irq = old_desc->irq;
-
- raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-
- /* We have to check it to avoid races with another CPU */
- desc = irq_to_desc(irq);
-
- if (desc && old_desc != desc)
- goto out_unlock;
-
- desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
- if (!desc) {
- printk(KERN_ERR "irq %d: can not get new irq_desc "
- "for migration.\n", irq);
- /* still use old one */
- desc = old_desc;
- goto out_unlock;
- }
- if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
- /* still use old one */
- kfree(desc);
- desc = old_desc;
- goto out_unlock;
- }
-
- replace_irq_desc(irq, desc);
- raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
- /* free the old one */
- free_one_irq_desc(old_desc, desc);
- kfree(old_desc);
-
- return desc;
-
-out_unlock:
- raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
- return desc;
-}
-
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
-{
- /* those static or target node is -1, do not move them */
- if (desc->irq < NR_IRQS_LEGACY || node == -1)
- return desc;
-
- if (desc->node != node)
- desc = __real_move_irq_desc(desc, node);
-
- return desc;
-}
-
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd..01b1d3a8898 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- const struct cpumask *mask = desc->affinity;
+ const struct cpumask *mask = desc->irq_data.affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
+ if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
irq_balancing_disabled(irq))
return -EIO;
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
- seq_printf(m, "%d\n", desc->node);
+ seq_printf(m, "%d\n", desc->irq_data.node);
return 0;
}
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
char name [MAX_NAMELEN];
- if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
+ if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
return;
memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
&irq_spurious_proc_fops, (void *)(long)irq);
}
+void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir || !desc->dir)
+ return;
+#ifdef CONFIG_SMP
+ remove_proc_entry("smp_affinity", desc->dir);
+ remove_proc_entry("affinity_hint", desc->dir);
+ remove_proc_entry("node", desc->dir);
+#endif
+ remove_proc_entry("spurious", desc->dir);
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%u", irq);
+ remove_proc_entry(name, root_irq_dir);
+}
+
#undef MAX_NAMELEN
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a..891115a929a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
/*
* Make sure the interrupt is enabled, before resending it:
*/
- desc->chip->enable(irq);
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
/*
* We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
- if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
+ if (!desc->irq_data.chip->irq_retrigger ||
+ !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534..3089d3b9d5f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
#include <linux/moduleparam.h>
#include <linux/timer.h>
+#include "internals.h"
+
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
* If we did actual work for the real IRQ line we must let the
* IRQ controller clean up too
*/
- if (work && desc->chip && desc->chip->end)
- desc->chip->end(irq);
+ if (work)
+ irq_end(irq, desc);
raw_spin_unlock(&desc->lock);
return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
desc->depth++;
- desc->chip->disable(irq);
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 00000000000..f16763ff848
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free NULL, 0 -> {claimed} : free to be used
+ * claimed NULL, 3 -> {pending} : claimed to be enqueued
+ * pending next, 3 -> {busy} : queued, pending callback
+ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING 1UL
+#define IRQ_WORK_BUSY 2UL
+#define IRQ_WORK_FLAGS 3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+ return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+ unsigned long next = (unsigned long)entry->next;
+ next &= ~IRQ_WORK_FLAGS;
+ return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+ unsigned long next = (unsigned long)entry;
+ next |= flags;
+ return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+ struct irq_work *next, *nflags;
+
+ do {
+ next = entry->next;
+ if ((unsigned long)next & IRQ_WORK_PENDING)
+ return false;
+ nflags = next_flags(next, IRQ_WORK_FLAGS);
+ } while (cmpxchg(&entry->next, next, nflags) != next);
+
+ return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+ /*
+ * Lame architectures will get the timer tick callback
+ */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+ struct irq_work **head, *next;
+
+ head = &get_cpu_var(irq_work_list);
+
+ do {
+ next = *head;
+ /* Can assign non-atomic because we keep the flags set. */
+ entry->next = next_flags(next, IRQ_WORK_FLAGS);
+ } while (cmpxchg(head, next, entry) != next);
+
+ /* The list was empty, raise self-interrupt to start processing. */
+ if (!irq_work_next(entry))
+ arch_irq_work_raise();
+
+ put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+ if (!irq_work_claim(entry)) {
+ /*
+ * Already enqueued, can't do!
+ */
+ return false;
+ }
+
+ __irq_work_queue(entry);
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ struct irq_work *list, **head;
+
+ head = &__get_cpu_var(irq_work_list);
+ if (*head == NULL)
+ return;
+
+ BUG_ON(!in_irq());
+ BUG_ON(!irqs_disabled());
+
+ list = xchg(head, NULL);
+ while (list != NULL) {
+ struct irq_work *entry = list;
+
+ list = irq_work_next(list);
+
+ /*
+ * Clear the PENDING bit, after this point the @entry
+ * can be re-used.
+ */
+ entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+ entry->func(entry);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+ WARN_ON_ONCE(irqs_disabled());
+
+ while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+ cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a51023..42ba65dff7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
#endif
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk(KERN_ERR
+ "BUG: looking up invalid subclass: %u\n", subclass);
+ printk(KERN_ERR
+ "turning off the locking correctness validator.\n");
+ dump_stack();
+ return NULL;
+ }
+
/*
* Static locks do not have their class-keys yet - for them the key
* is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
raw_local_irq_restore(flags);
if (!subclass || force)
- lock->class_cache = class;
+ lock->class_cache[0] = class;
+ else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ lock->class_cache[subclass] = class;
if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- lock->class_cache = NULL;
+ int i;
+
+ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+ lock->class_cache[i] = NULL;
+
#ifdef CONFIG_LOCK_STAT
lock->cpu = raw_smp_processor_id();
#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
- debug_locks_off();
- printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
- return 0;
- }
-
if (lock->key == &__lockdep_no_validate__)
check = 1;
- if (!subclass)
- class = lock->class_cache;
+ if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ class = lock->class_cache[subclass];
/*
- * Not cached yet or subclass?
+ * Not cached?
*/
if (unlikely(!class)) {
class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
return 1;
if (hlock->references) {
- struct lock_class *class = lock->class_cache;
+ struct lock_class *class = lock->class_cache[0];
if (!class)
class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
if (list_empty(head))
continue;
list_for_each_entry_safe(class, next, head, hash_entry) {
- if (unlikely(class == lock->class_cache)) {
+ int match = 0;
+
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ match |= class == lock->class_cache[j];
+
+ if (unlikely(match)) {
if (debug_locks_off_graph_unlock())
WARN_ON(1);
goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
* Careful: only use this function if you are sure that
* the task cannot run in parallel!
*/
-void __debug_show_held_locks(struct task_struct *task)
+void debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
}
lockdep_print_held_locks(task);
}
-EXPORT_SYMBOL_GPL(__debug_show_held_locks);
-
-void debug_show_held_locks(struct task_struct *task)
-{
- __debug_show_held_locks(task);
-}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
void lockdep_sys_exit(void)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1ec3916ffef..f309e8014c7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -34,7 +34,7 @@
#include <asm/irq_regs.h>
-static atomic_t nr_events __read_mostly;
+atomic_t perf_task_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
@@ -315,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader;
- WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+ /*
+ * We can have double attach due to group movement in perf_event_open.
+ */
+ if (event->attach_state & PERF_ATTACH_GROUP)
+ return;
+
event->attach_state |= PERF_ATTACH_GROUP;
if (group_leader == event)
@@ -412,8 +417,8 @@ event_filter_match(struct perf_event *event)
return event->cpu == -1 || event->cpu == smp_processor_id();
}
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -432,14 +437,13 @@ event_sched_out(struct perf_event *event,
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
- return;
+ return 0;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = ctx->time;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -448,6 +452,19 @@ event_sched_out(struct perf_event *event,
ctx->nr_active--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+ return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ int ret;
+
+ ret = __event_sched_out(event, cpuctx, ctx);
+ if (ret)
+ event->tstamp_stopped = ctx->time;
}
static void
@@ -647,7 +664,7 @@ retry:
}
static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -667,8 +684,6 @@ event_sched_in(struct perf_event *event,
return -EAGAIN;
}
- event->tstamp_running += ctx->time - event->tstamp_stopped;
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
ctx->nr_active++;
@@ -679,6 +694,35 @@ event_sched_in(struct perf_event *event,
return 0;
}
+static inline int
+event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ int ret = __event_sched_in(event, cpuctx, ctx);
+ if (ret)
+ return ret;
+ event->tstamp_running += ctx->time - event->tstamp_stopped;
+ return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+ u64 now = ctx->time;
+
+ group_event->tstamp_running += now - group_event->tstamp_stopped;
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ }
+}
+
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
@@ -692,7 +736,13 @@ group_sched_in(struct perf_event *group_event,
pmu->start_txn(pmu);
- if (event_sched_in(group_event, cpuctx, ctx)) {
+ /*
+ * use __event_sched_in() to delay updating tstamp_running
+ * until the transaction is committed. In case of failure
+ * we will keep an unmodified tstamp_running which is a
+ * requirement to get correct timing information
+ */
+ if (__event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
return -EAGAIN;
}
@@ -701,26 +751,31 @@ group_sched_in(struct perf_event *group_event,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (__event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
}
- if (!pmu->commit_txn(pmu))
+ if (!pmu->commit_txn(pmu)) {
+ /* commit tstamp_running */
+ group_commit_event_sched_in(group_event, cpuctx, ctx);
return 0;
-
+ }
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
+ *
+ * use __event_sched_out() to avoid updating tstamp_stopped
+ * because the event never actually ran
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ __event_sched_out(event, cpuctx, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ __event_sched_out(group_event, cpuctx, ctx);
pmu->cancel_txn(pmu);
@@ -1256,8 +1311,8 @@ void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* accessing the event control register. If a NMI hits, then it will
* not restart the event.
*/
-void perf_event_task_sched_out(struct task_struct *task,
- struct task_struct *next)
+void __perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next)
{
int ctxn;
@@ -1285,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
/*
* Called with IRQs disabled
*/
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
- task_ctx_sched_out(ctx, EVENT_ALL);
-}
-
-/*
- * Called with IRQs disabled
- */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
@@ -1439,7 +1486,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
* accessing the event control register. If a NMI hits, then it will
* keep the event running.
*/
-void perf_event_task_sched_in(struct task_struct *task)
+void __perf_event_task_sched_in(struct task_struct *task)
{
struct perf_event_context *ctx;
int ctxn;
@@ -1780,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
- update_context_time(ctx);
+ /*
+ * may read while context is not active
+ * (e.g., thread is blocked), in that case
+ * we cannot update context time
+ */
+ if (ctx->is_active)
+ update_context_time(ctx);
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -2129,11 +2182,9 @@ retry:
}
}
- put_task_struct(task);
return ctx;
errout:
- put_task_struct(task);
return ERR_PTR(err);
}
@@ -2150,15 +2201,15 @@ static void free_event_rcu(struct rcu_head *head)
kfree(event);
}
-static void perf_pending_sync(struct perf_event *event);
static void perf_buffer_put(struct perf_buffer *buffer);
static void free_event(struct perf_event *event)
{
- perf_pending_sync(event);
+ irq_work_sync(&event->pending);
if (!event->parent) {
- atomic_dec(&nr_events);
+ if (event->attach_state & PERF_ATTACH_TASK)
+ jump_label_dec(&perf_task_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2458,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event,
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
struct perf_event_context *ctx = event->ctx;
- unsigned long size;
int ret = 0;
u64 value;
if (!event->attr.sample_period)
return -EINVAL;
- size = copy_from_user(&value, arg, sizeof(value));
- if (size != sizeof(value))
+ if (copy_from_user(&value, arg, sizeof(value)))
return -EFAULT;
if (!value)
@@ -3106,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
}
}
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct irq_work *entry)
{
struct perf_event *event = container_of(entry,
struct perf_event, pending);
@@ -3131,89 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
}
}
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
- PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
- void (*func)(struct perf_pending_entry *))
-{
- struct perf_pending_entry **head;
-
- if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
- return;
-
- entry->func = func;
-
- head = &get_cpu_var(perf_pending_head);
-
- do {
- entry->next = *head;
- } while (cmpxchg(head, entry->next, entry) != entry->next);
-
- set_perf_event_pending();
-
- put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
- struct perf_pending_entry *list;
- int nr = 0;
-
- list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
- while (list != PENDING_TAIL) {
- void (*func)(struct perf_pending_entry *);
- struct perf_pending_entry *entry = list;
-
- list = list->next;
-
- func = entry->func;
- entry->next = NULL;
- /*
- * Ensure we observe the unqueue before we issue the wakeup,
- * so that we won't be waiting forever.
- * -- see perf_not_pending().
- */
- smp_wmb();
-
- func(entry);
- nr++;
- }
-
- return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
- /*
- * If we flush on whatever cpu we run, there is a chance we don't
- * need to wait.
- */
- get_cpu();
- __perf_pending_run();
- put_cpu();
-
- /*
- * Ensure we see the proper queue state before going to sleep
- * so that we do not miss the wakeup. -- see perf_pending_handle()
- */
- smp_rmb();
- return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
- wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
- __perf_pending_run();
-}
-
/*
* We assume there is only KVM supporting the callbacks.
* Later on, we might change it to a list if there is
@@ -3263,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
if (handle->nmi) {
handle->event->pending_wakeup = 1;
- perf_pending_queue(&handle->event->pending,
- perf_pending_event);
+ irq_work_queue(&handle->event->pending);
} else
perf_event_wakeup(handle->event);
}
@@ -4300,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
event->pending_kill = POLL_HUP;
if (nmi) {
event->pending_disable = 1;
- perf_pending_queue(&event->pending,
- perf_pending_event);
+ irq_work_queue(&event->pending);
} else
perf_event_disable(event);
}
@@ -4712,7 +4667,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
- atomic_dec(&perf_swevent_enabled[event_id]);
+ jump_label_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
}
@@ -4742,7 +4697,7 @@ static int perf_swevent_init(struct perf_event *event)
if (err)
return err;
- atomic_inc(&perf_swevent_enabled[event_id]);
+ jump_label_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
@@ -5291,9 +5246,10 @@ unlock:
*/
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
- struct perf_event *group_leader,
- struct perf_event *parent_event,
- perf_overflow_handler_t overflow_handler)
+ struct task_struct *task,
+ struct perf_event *group_leader,
+ struct perf_event *parent_event,
+ perf_overflow_handler_t overflow_handler)
{
struct pmu *pmu;
struct perf_event *event;
@@ -5318,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
init_waitqueue_head(&event->waitq);
+ init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
@@ -5334,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (task) {
+ event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ /*
+ * hw_breakpoint is a bit difficult here..
+ */
+ if (attr->type == PERF_TYPE_BREAKPOINT)
+ event->hw.bp_target = task;
+#endif
+ }
+
if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
@@ -5377,7 +5345,8 @@ done:
event->pmu = pmu;
if (!event->parent) {
- atomic_inc(&nr_events);
+ if (event->attach_state & PERF_ATTACH_TASK)
+ jump_label_inc(&perf_task_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -5586,10 +5555,18 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = NULL;
}
- event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+ if (pid != -1) {
+ task = find_lively_task_by_vpid(pid);
+ if (IS_ERR(task)) {
+ err = PTR_ERR(task);
+ goto err_group_fd;
+ }
+ }
+
+ event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_fd;
+ goto err_task;
}
/*
@@ -5621,21 +5598,13 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- if (pid != -1) {
- task = find_lively_task_by_vpid(pid);
- if (IS_ERR(task)) {
- err = PTR_ERR(task);
- goto err_group_fd;
- }
- }
-
/*
* Get the target context (task or percpu):
*/
ctx = find_get_context(pmu, task, cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_group_fd;
+ goto err_alloc;
}
/*
@@ -5731,9 +5700,13 @@ SYSCALL_DEFINE5(perf_event_open,
err_context:
put_ctx(ctx);
+err_alloc:
+ free_event(event);
+err_task:
+ if (task)
+ put_task_struct(task);
err_group_fd:
fput_light(group_file, fput_needed);
- free_event(event);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -5759,7 +5732,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
* Get the target context (task or percpu):
*/
- event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
+ event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err;
@@ -5868,7 +5841,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* our context.
*/
child_ctx = child->perf_event_ctxp[ctxn];
- __perf_event_task_sched_out(child_ctx);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Take the context lock here so that if find_get_context is
@@ -6027,6 +6000,7 @@ inherit_event(struct perf_event *parent_event,
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
+ child,
group_leader, parent_event,
NULL);
if (IS_ERR(child_event))
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d08..39b65b69584 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
- first = rcu_dereference_check(pid->tasks[type].first,
+ first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
rcu_read_lock_held() ||
lockdep_tasklist_lock_is_held());
if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
+ rcu_lockdep_assert(rcu_read_lock_held());
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008..2531017795f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
* provides serialisation for access to the entire console
* driver system.
*/
-static DECLARE_MUTEX(console_sem);
+static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
@@ -556,7 +556,7 @@ static void zap_locks(void)
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
- init_MUTEX(&console_sem);
+ sema_init(&console_sem, 1);
}
#if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb3..a23a57a976d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
/**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
* Check for bottom half being disabled, which covers both the
* CONFIG_PROVE_RCU and not cases. Note that if someone uses
* rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation. This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
*/
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
- return in_softirq();
+ return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be..d806735342a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Forward declarations for rcutiny_plugin.h. */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void __call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu),
+ struct rcu_ctrlblk *rcp);
+
+#include "rcutiny_plugin.h"
+
#ifdef CONFIG_NO_HZ
static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
rcu_sched_qs(cpu);
else if (!in_softirq())
rcu_bh_qs(cpu);
+ rcu_preempt_check_callbacks();
}
/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
*rcp->donetail = NULL;
if (rcp->curtail == rcp->donetail)
rcp->curtail = &rcp->rcucblist;
+ rcu_preempt_remove_callbacks(rcp);
rcp->donetail = &rcp->rcucblist;
local_irq_restore(flags);
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
}
/*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
}
/*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
* period. But since we have but one CPU, that would be after any
* quiescent state.
*/
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
__call_rcu(head, func, &rcu_sched_ctrlblk);
}
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
/*
* Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_barrier(void)
-{
- struct rcu_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
void rcu_barrier_bh(void)
{
struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
-
-#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc74..6ceca4f745f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
/*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
* Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
*
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#ifdef CONFIG_TINY_PREEMPT_RCU
+
+#include <linux/delay.h>
+
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+ struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
+ struct rcu_head **nexttail;
+ /* Tasks blocked in a preemptible RCU */
+ /* read-side critical section while an */
+ /* preemptible-RCU grace period is in */
+ /* progress must wait for a later grace */
+ /* period. This pointer points to the */
+ /* ->next pointer of the last task that */
+ /* must wait for a later grace period, or */
+ /* to &->rcb.rcucblist if there is no */
+ /* such task. */
+ struct list_head blkd_tasks;
+ /* Tasks blocked in RCU read-side critical */
+ /* section. Tasks are placed at the head */
+ /* of this list and age towards the tail. */
+ struct list_head *gp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current grace period, or NULL if there */
+ /* is not such task. */
+ struct list_head *exp_tasks;
+ /* Pointer to first task blocking the */
+ /* current expedited grace period, or NULL */
+ /* if there is no such task. If there */
+ /* is no current expedited grace period, */
+ /* then there cannot be any such task. */
+ u8 gpnum; /* Current grace period. */
+ u8 gpcpu; /* Last grace period blocked by the CPU. */
+ u8 completed; /* Last grace period completed. */
+ /* If all three are equal, RCU is idle. */
+};
+
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+ .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+ return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Check for a running RCU reader. Because there is only one CPU,
+ * there can be but one running RCU reader at a time. ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+ return current->rcu_read_lock_nesting;
+}
+
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+ return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+ return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+ return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+ return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state. There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state. The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time. Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period. (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+ /* Record both CPU and task as having responded to current GP. */
+ rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+ current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+
+ /*
+ * If there is no GP, or if blocked readers are still blocking GP,
+ * then there is nothing more to do.
+ */
+ if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+ return;
+
+ /* Advance callbacks. */
+ rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+ rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+ rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+
+ /* If there are no blocked readers, next GP is done instantly. */
+ if (!rcu_preempt_blocked_readers_any())
+ rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+
+ /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Start a new RCU grace period if warranted. Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+ if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+
+ /* Official start of GP. */
+ rcu_preempt_ctrlblk.gpnum++;
+
+ /* Any blocked RCU readers block new GP. */
+ if (rcu_preempt_blocked_readers_any())
+ rcu_preempt_ctrlblk.gp_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+
+ /* If there is no running reader, CPU is done with GP. */
+ if (!rcu_preempt_running_reader())
+ rcu_preempt_cpu_qs();
+ }
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from. If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * If the task started after the current grace period began, as recorded
+ * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
+ * before the element referenced by ->gp_tasks (or at the tail if
+ * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+ struct task_struct *t = current;
+ unsigned long flags;
+
+ local_irq_save(flags); /* must exclude scheduler_tick(). */
+ if (rcu_preempt_running_reader() &&
+ (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+ /* Possibly blocking in an RCU read-side critical section. */
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * If this CPU has already checked in, then this task
+ * will hold up the next grace period rather than the
+ * current grace period. Queue the task accordingly.
+ * If the task is queued for the current grace period
+ * (i.e., this CPU has not yet passed through a quiescent
+ * state for the current grace period), then as long
+ * as that task remains queued, the current grace period
+ * cannot end.
+ */
+ list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+ if (rcu_cpu_blocking_cur_gp())
+ rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+ }
+
+ /*
+ * Either we were not in an RCU read-side critical section to
+ * begin with, or we have now recorded that critical section
+ * globally. Either way, we can now note a quiescent state
+ * for this CPU. Again, if we were in an RCU read-side critical
+ * section, and if that critical section was blocking the current
+ * grace period, then the fact that the task has been enqueued
+ * means that current grace period continues to be blocked.
+ */
+ rcu_preempt_cpu_qs();
+ local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+ int empty;
+ int empty_exp;
+ unsigned long flags;
+ struct list_head *np;
+ int special;
+
+ /*
+ * NMI handlers cannot block and cannot safely manipulate state.
+ * They therefore cannot possibly be special, so just leave.
+ */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * If RCU core is waiting for this CPU to exit critical section,
+ * let it know that we have done so.
+ */
+ special = t->rcu_read_unlock_special;
+ if (special & RCU_READ_UNLOCK_NEED_QS)
+ rcu_preempt_cpu_qs();
+
+ /* Hardware IRQ handlers cannot block. */
+ if (in_irq()) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* Clean up if blocked during RCU read-side critical section. */
+ if (special & RCU_READ_UNLOCK_BLOCKED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * Remove this task from the ->blkd_tasks list and adjust
+ * any pointers that might have been referencing it.
+ */
+ empty = !rcu_preempt_blocked_readers_cgp();
+ empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ list_del(&t->rcu_node_entry);
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+ rcu_preempt_ctrlblk.gp_tasks = np;
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+ rcu_preempt_ctrlblk.exp_tasks = np;
+ INIT_LIST_HEAD(&t->rcu_node_entry);
+
+ /*
+ * If this was the last task on the current list, and if
+ * we aren't waiting on the CPU, report the quiescent state
+ * and start a new grace period if needed.
+ */
+ if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+ rcu_preempt_cpu_qs();
+ rcu_preempt_start_gp();
+ }
+
+ /*
+ * If this was the last task on the expedited lists,
+ * then we need wake up the waiting task.
+ */
+ if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_report_exp_done();
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+ --t->rcu_read_lock_nesting;
+ barrier(); /* decrement before load of ->rcu_read_unlock_special */
+ if (t->rcu_read_lock_nesting == 0 &&
+ unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+ WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * Check for a quiescent state from the current CPU. When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere. This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+ struct task_struct *t = current;
+
+ if (rcu_preempt_gp_in_progress() &&
+ (!rcu_preempt_running_reader() ||
+ !rcu_cpu_blocking_cur_gp()))
+ rcu_preempt_cpu_qs();
+ if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+ rcu_preempt_ctrlblk.rcb.donetail)
+ raise_softirq(RCU_SOFTIRQ);
+ if (rcu_preempt_gp_in_progress() &&
+ rcu_cpu_blocking_cur_gp() &&
+ rcu_preempt_running_reader())
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from __rcu_process_callbacks() to
+ * handle that case. Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+ if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+ rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+ __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+
+/*
+ * Queue a preemptible -RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+
+ debug_rcu_head_queue(head);
+ head->func = func;
+ head->next = NULL;
+
+ local_irq_save(flags);
+ *rcu_preempt_ctrlblk.nexttail = head;
+ rcu_preempt_ctrlblk.nexttail = &head->next;
+ rcu_preempt_start_gp(); /* checks to see if GP needed. */
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+void rcu_barrier(void)
+{
+ struct rcu_synchronize rcu;
+
+ init_rcu_head_on_stack(&rcu.head);
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ call_rcu(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!rcu_scheduler_active)
+ return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+ WARN_ON_ONCE(rcu_preempt_running_reader());
+ if (!rcu_preempt_blocked_readers_any())
+ return;
+
+ /* Once we get past the fastpath checks, same code as rcu_barrier(). */
+ rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+ return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+ wake_up(&sync_rcu_preempt_exp_wq);
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but expedite it. The basic idea
+ * is to rely in the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section. Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list. So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+ unsigned long snap;
+
+ barrier(); /* ensure prior action seen before grace period. */
+
+ WARN_ON_ONCE(rcu_preempt_running_reader());
+
+ /*
+ * Acquire lock so that there is only one preemptible RCU grace
+ * period in flight. Of course, if someone does the expedited
+ * grace period for us while we are acquiring the lock, just leave.
+ */
+ snap = sync_rcu_preempt_exp_count + 1;
+ mutex_lock(&sync_rcu_preempt_exp_mutex);
+ if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+ goto unlock_mb_ret; /* Others did our work for us. */
+
+ local_irq_save(flags);
+
+ /*
+ * All RCU readers have to already be on blkd_tasks because
+ * we cannot legally be executing in an RCU read-side critical
+ * section.
+ */
+
+ /* Snapshot current head of ->blkd_tasks list. */
+ rpcp->exp_tasks = rpcp->blkd_tasks.next;
+ if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+ rpcp->exp_tasks = NULL;
+ local_irq_restore(flags);
+
+ /* Wait for tail of ->blkd_tasks list to drain. */
+ if (rcu_preempted_readers_exp())
+ wait_event(sync_rcu_preempt_exp_wq,
+ !rcu_preempted_readers_exp());
+
+ /* Clean up and exit. */
+ barrier(); /* ensure expedited GP seen before counter increment. */
+ sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+ mutex_unlock(&sync_rcu_preempt_exp_mutex);
+ barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+ if (!rcu_preempt_running_reader())
+ rcu_preempt_cpu_qs();
+ return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+
+/*
+ * Check for a task exiting while in a preemptible -RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0)
+ return;
+ t->rcu_read_lock_nesting = 1;
+ rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b..9d8e8fb2515 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
};
static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
static long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */
- /* of kthreads. */
+/*
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
/*
* Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
mdelay(longdelay_ms);
if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+ preempt_schedule(); /* No QS if preempt_disable() in effect */
+#endif
}
static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
if (!delay)
schedule_timeout_interruptible(longdelay);
+ else
+ rcu_read_delay(rrsp);
}
static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
continue;
rp->rtort_pipe_count = 0;
udelay(rcu_random(&rand) & 0x3ff);
- old_rp = rcu_torture_current;
+ old_rp = rcu_dereference_check(rcu_torture_current,
+ current == writer_task);
rp->rtort_mbtest = 1;
rcu_assign_pointer(rcu_torture_current, rp);
smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5..ccdc04c4798 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- /* OK, time to rat on our buddy... */
-
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
rsp->name);
rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
long delta;
struct rcu_node *rnp;
- if (rcu_cpu_stall_panicking)
+ if (rcu_cpu_stall_suppress)
return;
- delta = jiffies - rsp->jiffies_stall;
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
rnp = rdp->mynode;
- if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+ if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
- rcu_cpu_stall_panicking = 1;
+ rcu_cpu_stall_suppress = 1;
return NOTIFY_DONE;
}
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_preempt_stall_reset();
+}
+
static struct notifier_block rcu_panic_block = {
.notifier_call = rcu_panic,
};
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
}
+void rcu_cpu_stall_reset(void)
+{
+}
+
static void __init check_cpu_stall_init(void)
{
}
@@ -712,7 +745,7 @@ static void
rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
- struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
{
int i;
- struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
rsp->orphan_qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen = 0;
raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
struct rcu_data *rdp;
raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = rsp->rda[smp_processor_id()];
+ rdp = this_cpu_ptr(rsp->rda);
if (rsp->orphan_cbs_list == NULL) {
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
rdp->qlen += rsp->orphan_qlen;
+ rdp->n_cbs_adopted += rsp->orphan_qlen;
rsp->orphan_cbs_list = NULL;
rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
int need_report = 0;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp;
/* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Update count, and requeue any remaining callbacks. */
rdp->qlen -= count;
+ rdp->n_cbs_invoked += count;
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+ if ((rnp->qsmask & bit) != 0 &&
+ f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* a quiescent state betweentimes.
*/
local_irq_save(flags);
- rdp = rsp->rda[smp_processor_id()];
+ rdp = this_cpu_ptr(rsp->rda);
rcu_process_gp_end(rsp, rdp);
check_for_new_grace_period(rsp, rdp);
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
int i;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
{
unsigned long flags;
unsigned long mask;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+ struct rcu_data __percpu *rda)
{
static char *buf[] = { "rcu_node_level_0",
"rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
}
}
+ rsp->rda = rda;
rnp = rsp->level[NUM_RCU_LVLS - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
- rsp->rda[i]->mynode = rnp;
+ per_cpu_ptr(rsp->rda, i)->mynode = rnp;
rcu_boot_init_percpu_data(i, rsp);
}
}
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
- * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
- int i; \
- \
- for_each_possible_cpu(i) { \
- (rsp)->rda[i] = &per_cpu(rcu_data, i); \
- } \
- rcu_init_one(rsp); \
-} while (0)
-
void __init rcu_init(void)
{
int cpu;
rcu_bootup_announce();
- RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
- RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+ rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed..91d4170c5c1 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
long qlen; /* # of queued callbacks */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
+ unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
#define RCU_STALL_DELAY_DELTA 0
#endif
-#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+ RCU_STALL_DELAY_DELTA)
/* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
/* for rsp->jiffies_stall */
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
/* before ratting on them. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
-#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
-#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
- struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
+ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
/* The following fields are guarded by the root rcu_node's lock. */
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d..71a4147473f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
printk(KERN_INFO
"\tRCU-based detection of stalled CPUs is disabled.\n");
#endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
#endif
#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = rcu_preempt_state.rda[cpu];
+ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
*/
void __rcu_read_lock(void)
{
- ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+ current->rcu_read_lock_nesting++;
barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
struct task_struct *t = current;
barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
- if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+ --t->rcu_read_lock_nesting;
+ barrier(); /* decrement before load of ->rcu_read_unlock_special */
+ if (t->rcu_read_lock_nesting == 0 &&
unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);
#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
}
}
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+ rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
+
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
*
* Control will return to the caller some time after a full grace
* period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*/
void synchronize_rcu(void)
{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
*/
static void __init __rcu_init_preempt(void)
{
- RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+ rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}
/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
{
}
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
+
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
}
/*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptable RCU does not exist, map to rcu-sched.
*/
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738..d15430b9d12 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+ rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%lu,%lu,%lu\n",
+ rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
#ifdef CONFIG_NO_HZ
seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
- seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
#ifdef CONFIG_TREE_PREEMPT_RCU
seq_puts(m, "\"rcu_preempt:\"\n");
PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
struct rcu_data *rdp;
for_each_possible_cpu(cpu) {
- rdp = rsp->rda[cpu];
+ rdp = per_cpu_ptr(rsp->rda, cpu);
if (rdp->beenonline)
print_one_rcu_pending(m, rdp);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index c0d2067f3e0..d42992bccdf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
atomic_t rto_count;
-#ifdef CONFIG_SMP
struct cpupri cpupri;
-#endif
};
/*
@@ -437,7 +435,7 @@ struct root_domain {
*/
static struct root_domain def_root_domain;
-#endif
+#endif /* CONFIG_SMP */
/*
* This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle;
+ struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
+ u64 clock_task;
atomic_t nr_iowait;
@@ -520,6 +519,10 @@ struct rq {
u64 avg_idle;
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
inline void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update)
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update) {
+ int cpu = cpu_of(rq);
+ u64 irq_time;
+
+ rq->clock = sched_clock_cpu(cpu);
+ irq_time = irq_time_cpu(cpu);
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
+
+ sched_irq_time_avg_update(rq, irq_time);
+ }
}
/*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
- char *cmp = buf;
+ char *cmp;
int neg = 0;
int i;
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
return -EFAULT;
buf[cnt] = 0;
+ cmp = strstrip(buf);
if (strncmp(buf, "NO_", 3) == 0) {
neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
for (i = 0; sched_feat_names[i]; i++) {
- int len = strlen(sched_feat_names[i]);
-
- if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg)
sysctl_sched_features &= ~(1UL << i);
else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
- if (task_has_rt_policy(p)) {
- p->se.load.weight = 0;
- p->se.load.inv_weight = WMULT_CONST;
- return;
- }
-
/*
* SCHED_IDLE tasks get minimal weight:
*/
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dec_nr_running(rq);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+ if (!sched_clock_irqtime)
+ return 0;
+
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ int cpu;
+ u64 now, delta;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ now = sched_clock_cpu(cpu);
+ delta = now - per_cpu(irq_start_time, cpu);
+ per_cpu(irq_start_time, cpu) = now;
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ per_cpu(cpu_hardirq_time, cpu) += delta;
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ per_cpu(cpu_softirq_time, cpu) += delta;
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+ if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+ u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+ rq->prev_irq_time = curr_irq_time;
+ sched_rt_avg_update(rq, delta_irq);
+ }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+ return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
/*
* __normal_prio - return the priority that is based on the static prio
*/
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
if (p->sched_class != &fair_sched_class)
return 0;
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
- if (likely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (likely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock - p->se.exec_start;
+ ns = rq->clock_task - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (in_serving_softirq())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
return p;
}
- class = sched_class_highest;
- for ( ; ; ) {
+ for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
- /*
- * Will never be NULL as the idle class always
- * returns a non-NULL p:
- */
- class = class->next;
}
+
+ BUG(); /* the idle class will always have a runnable task */
}
/*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
rq = task_rq_lock(p, &flags);
+ trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
}
if (user) {
- retval = security_task_setscheduler(p, policy, param);
+ retval = security_task_setscheduler(p);
if (retval)
return retval;
}
@@ -4661,6 +4788,15 @@ recheck:
*/
rq = __task_rq_lock(p);
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EINVAL;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->se.exec_start = sched_clock();
cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
__set_task_cpu(idle, cpu);
+ rcu_read_unlock();
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
cpumask_var_t nodemask;
cpumask_var_t this_sibling_map;
cpumask_var_t this_core_map;
+ cpumask_var_t this_book_map;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
struct sched_group **sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
+ sa_this_book_map,
sa_this_core_map,
sa_this_sibling_map,
sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
-
+#ifdef CONFIG_SCHED_SMT
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
+#else
+ group = cpu;
+#endif
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
return group;
}
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *mask)
{
+ int group = cpu;
+#ifdef CONFIG_SCHED_MC
+ cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#endif
if (sg)
- *sg = &per_cpu(sched_group_core, cpu).sg;
- return cpu;
+ *sg = &per_cpu(sched_group_book, group).sg;
+ return group;
}
-#endif
+#endif /* CONFIG_SCHED_BOOK */
static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+ cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_MC
SD_INIT_FUNC(MC)
#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
static int default_relax_domain_level = -1;
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
+ case sa_this_book_map:
+ free_cpumask_var(d->this_book_map); /* fall through */
case sa_this_core_map:
free_cpumask_var(d->this_core_map); /* fall through */
case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
return sa_nodemask;
if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
return sa_this_core_map;
+ if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ return sa_this_book_map;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
return sd;
}
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+ sd = &per_cpu(book_domains, i).sd;
+ SD_INIT(sd, BOOK);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+ return sd;
+}
+
static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
@@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
d->send_covered, d->tmpmask);
break;
#endif
+#ifdef CONFIG_SCHED_BOOK
+ case SD_LV_BOOK: /* set up book groups */
+ cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+ if (cpu == cpumask_first(d->this_book_map))
+ init_sched_build_groups(d->this_book_map, cpu_map,
+ &cpu_to_book_group,
+ d->send_covered, d->tmpmask);
+ break;
+#endif
case SD_LV_CPU: /* set up physical groups */
cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
}
for_each_cpu(i, cpu_map) {
build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
}
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
init_sched_groups_power(i, sd);
}
#endif
+#ifdef CONFIG_SCHED_BOOK
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(book_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
+#endif
for_each_cpu(i, cpu_map) {
sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+ sd = &per_cpu(book_domains, i).sd;
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
- err_free_rq:
+err_free_rq:
kfree(cfs_rq);
- err:
+err:
return 0;
}
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
- err_free_rq:
+err_free_rq:
kfree(rt_rq);
- err:
+err:
return 0;
}
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49..933f3d1b62e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
+
+ /* re-arm NEWIDLE balancing when moving tasks */
+ src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+ this_rq->idle_stamp = 0;
}
/*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
+ unsigned long this_has_capacity;
/* Statistics of the busiest group */
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
u64 total, available;
total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
+
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
+ max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
if (min_cpu_load > load)
min_cpu_load = load;
}
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
+
+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
}
/**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
- * and move all the excess tasks away.
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
+ sds->this_has_capacity = sgs.group_has_capacity;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
sds->group_imb = sgs.group_imb;
}
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
return fix_small_imbalance(sds, this_cpu, imbalance);
}
+
/******* find_busiest_group() helpers end here *********************/
/**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
+ *
+ * Note: when doing newidle balance, if the local group has excess
+ * capacity (i.e. nr_running < group_capacity) and the busiest group
+ * does not have any capacity, we force a load balance to pull tasks
+ * to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
if (sds.this_load >= sds.max_load)
goto out_balanced;
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
+force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ if (pulled_task)
break;
- }
}
raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
update_rq_clock(rq);
- if (unlikely(task_cpu(p) != this_cpu))
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3e..185f920ec1a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
* release the lock. Decreases scheduling overhead.
*/
SCHED_FEAT(OWNER_SPIN, 1)
+
+/*
+ * Decrement CPU power based on irq activity
+ */
+SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67..bea7d79f7e9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
if (!task_has_rt_policy(curr))
return;
- delta_exec = rq->clock - curr->se.exec_start;
+ delta_exec = rq->clock_task - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock;
+ curr->se.exec_start = rq->clock_task;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
*
- * We want to avoid overloading runqueues. Even if
- * the RT task is of higher priority than the current RT task.
- * RT tasks behave differently than other tasks. If
- * one gets preempted, we try to push it off to another queue.
- * So trying to keep a preempting RT task on the same
- * cache hot CPU will force the running RT task to
- * a cold CPU. So we waste all the cache for the lower
- * RT task in hopes of saving some of a RT task
- * that is just being woken and probably will have
- * cold cache anyway.
+ * We want to avoid overloading runqueues. If the woken
+ * task is a higher priority, then it will stay on this CPU
+ * and the lower prio task should be moved to another CPU.
+ * Even though this will probably make the lower prio task
+ * lose its cache, we do not want to bounce a higher task
+ * around just because it gave up its CPU, perhaps for a
+ * lock?
+ *
+ * For equal prio tasks, we just let the scheduler sort it out.
*/
if (unlikely(rt_task(rq->curr)) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
int cpu = find_lowest_rq(p);
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;
return p;
}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
for_each_leaf_rt_rq(rt_rq, rq) {
array = &rt_rq->active;
idx = sched_find_first_bit(array->bitmap);
- next_idx:
+next_idx:
if (idx >= MAX_RT_PRIO)
continue;
if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
- retry:
+retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
* but possible)
*/
}
- skip:
+skip:
double_unlock_balance(this_rq, src_rq);
}
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1)
+ p->rt.nr_cpus_allowed > 1 &&
+ rt_task(rq->curr) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio))
push_rt_tasks(rq);
}
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 00000000000..45bddc0c104
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct rq *rq, struct task_struct *p,
+ int sd_flag, int flags)
+{
+ return task_cpu(p); /* stop tasks as never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ resched_task(rq->curr); /* we preempt everything */
+}
+
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ struct task_struct *stop = rq->stop;
+
+ if (stop && stop->state == TASK_RUNNING)
+ return stop;
+
+ return NULL;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+ BUG(); /* the stop task should never yield, its pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ BUG(); /* its impossible to change to this class */
+}
+
+static void prio_changed_stop(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+{
+ BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+static const struct sched_class stop_sched_class = {
+ .next = &rt_sched_class,
+
+ .enqueue_task = enqueue_task_stop,
+ .dequeue_task = dequeue_task_stop,
+ .yield_task = yield_task_stop,
+
+ .check_preempt_curr = check_preempt_curr_stop,
+
+ .pick_next_task = pick_next_task_stop,
+ .put_prev_task = put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_stop,
+#endif
+
+ .set_curr_task = set_curr_task_stop,
+ .task_tick = task_tick_stop,
+
+ .get_rr_interval = get_rr_interval_stop,
+
+ .prio_changed = prio_changed_stop,
+ .switched_to = switched_to_stop,
+
+ /* no .task_new for stop tasks */
+};
diff --git a/kernel/signal.c b/kernel/signal.c
index bded6518778..919562c3d6b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2215,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
+#ifdef BUS_MCEERR_AO
+ /*
+ * Other callers might not initialize the si_lsb field,
+ * so check explicitely for the right codes here.
+ */
+ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
+#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73..fc978889b19 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
}
/*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ * softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
+/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip)
+static void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
- preempt_count() += SOFTIRQ_OFFSET;
+ preempt_count() += cnt;
/*
* Were softirqs turned off above:
*/
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
- if (preempt_count() == SOFTIRQ_OFFSET)
+ if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip)
+static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
- add_preempt_count(SOFTIRQ_OFFSET);
+ add_preempt_count(cnt);
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
void local_bh_disable(void)
{
- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(local_bh_disable);
+static void __local_bh_enable(unsigned int cnt)
+{
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == cnt)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(cnt);
+}
+
/*
* Special-case - softirqs can safely be enabled in
* cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
- WARN_ON_ONCE(!irqs_disabled());
-
- if (softirq_count() == SOFTIRQ_OFFSET)
- trace_softirqs_on((unsigned long)__builtin_return_address(0));
- sub_preempt_count(SOFTIRQ_OFFSET);
+ __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
/*
* Are softirqs going to be turned on now:
*/
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
trace_softirqs_on(ip);
/*
* Keep preemption disabled until we are done with
* softirq processing:
*/
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
+ sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
pending = local_softirq_pending();
account_system_vtime(current);
- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
lockdep_softirq_enter();
cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
lockdep_softirq_exit();
account_system_vtime(current);
- _local_bh_enable();
+ __local_bh_enable(SOFTIRQ_OFFSET);
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
rcu_irq_enter();
if (idle_cpu(cpu) && !in_interrupt()) {
- __irq_enter();
+ /*
+ * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * here, as softirq will be serviced on return from interrupt.
+ */
+ local_bh_disable();
tick_check_idle(cpu);
- } else
- __irq_enter();
+ _local_bh_enable();
+ }
+
+ __irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
+ current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
@@ -886,17 +910,14 @@ int __init __weak early_irq_init(void)
return 0;
}
+#ifdef CONFIG_GENERIC_HARDIRQS
int __init __weak arch_probe_nr_irqs(void)
{
- return 0;
+ return NR_IRQS_LEGACY;
}
int __init __weak arch_early_irq_init(void)
{
return 0;
}
-
-int __weak arch_init_chip_data(struct irq_desc *desc, int node)
-{
- return 0;
-}
+#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd50..c71e0750053 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
int __init_srcu_struct(struct srcu_struct *sp, const char *name,
struct lock_class_key *key)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
return init_srcu_struct_fields(sp);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb2512..090c28812ce 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
goto repeat;
}
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
unsigned int cpu = (unsigned long)hcpu;
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
get_task_struct(p);
+ kthread_bind(p, cpu);
+ sched_set_stop_task(cpu, p);
stopper->thread = p;
break;
case CPU_ONLINE:
- kthread_bind(stopper->thread, cpu);
/* strictly unnecessary, as first user will wake it */
wake_up_process(stopper->thread);
/* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
{
struct cpu_stop_work *work;
+ sched_set_stop_task(cpu, NULL);
/* kill the stopper */
kthread_stop(stopper->thread);
/* drain remaining works */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c6d22..3a45c224770 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
kbuf[left] = 0;
}
- for (; left && vleft--; i++, min++, max++, first=0) {
+ for (; left && vleft--; i++, first = 0) {
unsigned long val;
if (write) {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c82..10b90d8a03c 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
if (!table->maxlen)
set_fail(&fail, table, "No maxlen");
}
- if ((table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (table->maxlen > sizeof (unsigned long)) {
- if (!table->extra1)
- set_fail(&fail, table, "No min");
- if (!table->extra2)
- set_fail(&fail, table, "No max");
- }
- }
#ifdef CONFIG_PROC_SYSCTL
if (table->procname && !table->proc_handler)
set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a8..d2321891538 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
time_reftime = get_seconds();
offset64 = offset;
- freq_adj = (offset64 * secs) <<
- (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+ freq_adj = ntp_update_offset_fll(offset64, secs);
- freq_adj += ntp_update_offset_fll(offset64, secs);
+ /*
+ * Clamp update interval to reduce PLL gain with low
+ * sampling rate (e.g. intermittent network connection)
+ * to avoid instability.
+ */
+ if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
+ secs = 1 << (SHIFT_PLL + 1 + time_constant);
+
+ freq_adj += (offset64 * secs) <<
+ (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade..68a9ae7679b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
printk_tick();
- perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_run();
+#endif
scheduler_tick();
run_posix_cpu_timers(p);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65fb077ea79..ebd80d50c47 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
ret = ftrace_avail_open(inode, file);
if (!ret) {
- m = (struct seq_file *)file->private_data;
- iter = (struct ftrace_iterator *)m->private;
+ m = file->private_data;
+ iter = m->private;
iter->flags = FTRACE_ITER_FAILURES;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4e2f0341037..c5a632a669e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
int ring_buffer_print_page_header(struct trace_seq *s)
{
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f54115..001bcd2ccf4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
static int tracing_release(struct inode *inode, struct file *file)
{
- struct seq_file *m = (struct seq_file *)file->private_data;
+ struct seq_file *m = file->private_data;
struct trace_iterator *iter;
int cpu;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a..9021f8c0c0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ef49e9370b2..76b05980225 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -262,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
return trace_graph_entry(trace);
}
+static void
+__trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long flags, int pc)
+{
+ u64 time = trace_clock_local();
+ struct ftrace_graph_ent ent = {
+ .func = ip,
+ .depth = 0,
+ };
+ struct ftrace_graph_ret ret = {
+ .func = ip,
+ .depth = 0,
+ .calltime = time,
+ .rettime = time,
+ };
+
+ __trace_graph_entry(tr, &ent, flags, pc);
+ __trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ __trace_graph_function(tr, ip, flags, pc);
+}
+
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags,
@@ -888,12 +916,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
unsigned long addr, int depth)
{
int cpu = iter->cpu;
+ int *depth_irq;
struct fgraph_data *data = iter->private;
- int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
- if (flags & TRACE_GRAPH_PRINT_IRQS)
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
return 0;
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
/*
* We are inside the irq code
*/
@@ -926,12 +962,20 @@ static int
check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
{
int cpu = iter->cpu;
+ int *depth_irq;
struct fgraph_data *data = iter->private;
- int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
- if (flags & TRACE_GRAPH_PRINT_IRQS)
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
return 0;
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
/*
* We are not inside the irq code.
*/
@@ -1163,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
struct ftrace_graph_ent_entry *field;
struct fgraph_data *data = iter->private;
@@ -1226,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
static enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
- return print_graph_function_flags(iter, tracer_flags.val);
+ return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+ u32 flags)
+{
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ return __print_graph_function_flags(iter, flags);
}
static enum print_line_t
@@ -1258,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
seq_printf(s, "#%.*s|||| / \n", size, spaces);
}
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
{
int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
@@ -1299,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
print_graph_headers_flags(s, tracer_flags.val);
}
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+ struct trace_iterator *iter = s->private;
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ print_trace_header(s, iter);
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ } else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ __print_graph_headers_flags(s, flags);
+}
+
void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2..5cf8c602b88 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
#ifdef CONFIG_FUNCTION_TRACER
/*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ * incremented.
+ * 0 if the trace is to be ignored, and data->disabled
+ * is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
*/
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ unsigned long *flags)
{
- struct trace_array *tr = irqsoff_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
long disabled;
int cpu;
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
*/
cpu = raw_smp_processor_id();
if (likely(!per_cpu(tracing_cpu, cpu)))
- return;
+ return 0;
- local_save_flags(flags);
+ local_save_flags(*flags);
/* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(flags))
- return;
+ if (!irqs_disabled_flags(*flags))
+ return 0;
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ *data = tr->data[cpu];
+ disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ return 1;
+
+ atomic_dec(&(*data)->disabled);
+
+ return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return;
+
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
int ret;
- int cpu;
int pc;
- cpu = raw_smp_processor_id();
- if (likely(!per_cpu(tracing_cpu, cpu)))
+ if (!func_prolog_dec(tr, &data, &flags))
return 0;
- local_save_flags(flags);
- /* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(flags))
- return 0;
-
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
- } else
- ret = 0;
-
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
atomic_dec(&data->disabled);
+
return ret;
}
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
- long disabled;
- int cpu;
int pc;
- cpu = raw_smp_processor_id();
- if (likely(!per_cpu(tracing_cpu, cpu)))
+ if (!func_prolog_dec(tr, &data, &flags))
return;
- local_save_flags(flags);
- /* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(flags))
- return;
-
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
- }
-
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
atomic_dec(&data->disabled);
}
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
- u32 flags = GRAPH_TRACER_FLAGS;
-
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
- flags |= TRACE_GRAPH_PRINT_DURATION;
- else
- flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
/*
* In graph mode call the graph tracer output function,
* otherwise go with the TRACE_FN event handler
*/
if (is_graph())
- return print_graph_function_flags(iter, flags);
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
return TRACE_TYPE_UNHANDLED;
}
static void irqsoff_print_header(struct seq_file *s)
{
- if (is_graph()) {
- struct trace_iterator *iter = s->private;
- u32 flags = GRAPH_TRACER_FLAGS;
-
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
- /* print nothing if the buffers are empty */
- if (trace_empty(iter))
- return;
-
- print_trace_header(s, iter);
- flags |= TRACE_GRAPH_PRINT_DURATION;
- } else
- flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
- print_graph_headers_flags(s, flags);
- } else
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
trace_default_header(s);
}
static void
-trace_graph_function(struct trace_array *tr,
- unsigned long ip, unsigned long flags, int pc)
-{
- u64 time = trace_clock_local();
- struct ftrace_graph_ent ent = {
- .func = ip,
- .depth = 0,
- };
- struct ftrace_graph_ret ret = {
- .func = ip,
- .depth = 0,
- .calltime = time,
- .rettime = time,
- };
-
- __trace_graph_entry(tr, &ent, flags, pc);
- __trace_graph_return(tr, &ret, flags, pc);
-}
-
-static void
__trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
unsigned long flags, int pc)
{
- if (!is_graph())
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
trace_function(tr, ip, parent_ip, flags, pc);
- else {
- trace_graph_function(tr, parent_ip, flags, pc);
- trace_graph_function(tr, ip, flags, pc);
- }
}
#else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81..7319559ed59 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int wakeup_rt;
static arch_spinlock_t wakeup_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static void wakeup_reset(struct trace_array *tr);
static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
static int save_lat_flag;
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
#ifdef CONFIG_FUNCTION_TRACER
+
/*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ * is disabled and data->disabled is incremented.
+ * 0 if the trace is to be ignored, and preemption
+ * is not disabled and data->disabled is
+ * kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
*/
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ int *pc)
{
- struct trace_array *tr = wakeup_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
long disabled;
int cpu;
- int pc;
if (likely(!wakeup_task))
- return;
+ return 0;
- pc = preempt_count();
+ *pc = preempt_count();
preempt_disable_notrace();
cpu = raw_smp_processor_id();
if (cpu != wakeup_current_cpu)
goto out_enable;
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ *data = tr->data[cpu];
+ disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
- local_irq_save(flags);
+ return 1;
- trace_function(tr, ip, parent_ip, flags, pc);
+out:
+ atomic_dec(&(*data)->disabled);
+
+out_enable:
+ preempt_enable_notrace();
+ return 0;
+}
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_irq_save(flags);
+ trace_function(tr, ip, parent_ip, flags, pc);
local_irq_restore(flags);
- out:
atomic_dec(&data->disabled);
- out_enable:
preempt_enable_notrace();
}
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
+static int start_func_tracer(int graph)
+{
+ int ret;
+
+ if (!graph)
+ ret = register_ftrace_function(&trace_ops);
+ else
+ ret = register_ftrace_graph(&wakeup_graph_return,
+ &wakeup_graph_entry);
+
+ if (!ret && tracing_is_enabled())
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ return ret;
+}
+
+static void stop_func_tracer(int graph)
+{
+ tracer_enabled = 0;
+
+ if (!graph)
+ unregister_ftrace_function(&trace_ops);
+ else
+ unregister_ftrace_graph();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_func_tracer(!set);
+
+ wakeup_reset(wakeup_trace);
+ tracing_max_latency = 0;
+
+ return start_func_tracer(set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc, ret = 0;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return 0;
+
+ local_save_flags(flags);
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+ preempt_enable_notrace();
+
+ return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_save_flags(flags);
+ __trace_graph_return(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+
+ preempt_enable_notrace();
+ return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
+ trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
/*
* Should this new latency be reported/recorded?
*/
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
/* The task we are waiting for is waking up */
data = wakeup_trace->data[wakeup_cpu];
- trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
* is not called by an assembly function (where as schedule is)
* it should be safe to use it here.
*/
- trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- register_ftrace_function(&trace_ops);
-
- if (tracing_is_enabled())
- tracer_enabled = 1;
- else
- tracer_enabled = 0;
+ if (start_func_tracer(is_graph()))
+ printk(KERN_ERR "failed to start wakeup tracer\n");
return;
fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- unregister_ftrace_function(&trace_ops);
+ stop_func_tracer(is_graph());
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.print_max = 1,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
.use_max_tr = 1,
};
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = 1,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
.use_max_tr = 1,
};
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d6073a50a6c..e95ee7f31d4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -265,10 +265,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
*/
rcu_assign_pointer(elem->funcs, (*entry)->funcs);
if (!elem->state && active) {
- enable_jump_label(&elem->state);
+ jump_label_enable(&elem->state);
elem->state = active;
} else if (elem->state && !active) {
- disable_jump_label(&elem->state);
+ jump_label_disable(&elem->state);
elem->state = active;
}
}
@@ -285,7 +285,7 @@ static void disable_tracepoint(struct tracepoint *elem)
elem->unregfunc();
if (elem->state) {
- disable_jump_label(&elem->state);
+ jump_label_disable(&elem->state);
elem->state = 0;
}
rcu_assign_pointer(elem->funcs, NULL);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dc8e16824b5..bafba687a6d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
};
/* Callback function for perf event subsystem */
-void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event, int nmi,
struct perf_sample_data *data,
struct pt_regs *regs)
{