aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Kconfig.preempt33
-rw-r--r--kernel/cpu.c327
-rw-r--r--kernel/debug/kdb/kdb_io.c6
-rw-r--r--kernel/events/core.c1
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c41
-rw-r--r--kernel/futex.c84
-rw-r--r--kernel/hrtimer.c370
-rw-r--r--kernel/irq/handle.c8
-rw-r--r--kernel/irq/manage.c100
-rw-r--r--kernel/irq/settings.h12
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq_work.c27
-rw-r--r--kernel/itimer.c1
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/locking/Makefile9
-rw-r--r--kernel/locking/lglock.c79
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/percpu-rwsem.c4
-rw-r--r--kernel/locking/rt.c437
-rw-r--r--kernel/locking/rtmutex.c741
-rw-r--r--kernel/locking/rtmutex_common.h14
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c5
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/posix-cpu-timers.c198
-rw-r--r--kernel/posix-timers.c37
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/printk/printk.c146
-rw-r--r--kernel/ptrace.c7
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c143
-rw-r--r--kernel/rcu/tree.h10
-rw-r--r--kernel/rcu/tree_plugin.h160
-rw-r--r--kernel/rcu/update.c2
-rw-r--r--kernel/relay.c14
-rw-r--r--kernel/res_counter.c8
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/completion.c34
-rw-r--r--kernel/sched/core.c529
-rw-r--r--kernel/sched/cputime.c62
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/fair.c16
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/sched/rt.c1
-rw-r--r--kernel/sched/sched.h10
-rw-r--r--kernel/sched/wait-simple.c115
-rw-r--r--kernel/signal.c135
-rw-r--r--kernel/softirq.c733
-rw-r--r--kernel/stop_machine.c98
-rw-r--r--kernel/time/jiffies.c7
-rw-r--r--kernel/time/ntp.c40
-rw-r--r--kernel/time/tick-common.c10
-rw-r--r--kernel/time/tick-internal.h3
-rw-r--r--kernel/time/tick-sched.c30
-rw-r--r--kernel/time/timekeeping.c6
-rw-r--r--kernel/timer.c152
-rw-r--r--kernel/trace/Kconfig104
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/latency_hist.c1178
-rw-r--r--kernel/trace/ring_buffer.c20
-rw-r--r--kernel/trace/trace.c44
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c2
-rw-r--r--kernel/trace/trace_irqsoff.c11
-rw-r--r--kernel/trace/trace_output.c18
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/watchdog.c16
-rw-r--r--kernel/workqueue.c223
-rw-r--r--kernel/workqueue_internal.h5
72 files changed, 5847 insertions, 853 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index ecee67a00f5f..b867a1c4d96f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -225,4 +225,4 @@ config ARCH_SUPPORTS_ATOMIC_RMW
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c97419f02..11dbe26a8279 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,16 @@
+config PREEMPT
+ bool
+ select PREEMPT_COUNT
+
+config PREEMPT_RT_BASE
+ bool
+ select PREEMPT
+
+config HAVE_PREEMPT_LAZY
+ bool
+
+config PREEMPT_LAZY
+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
choice
prompt "Preemption Model"
@@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
-config PREEMPT
+config PREEMPT__LL
bool "Preemptible Kernel (Low-Latency Desktop)"
- select PREEMPT_COUNT
+ select PREEMPT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
@@ -52,6 +65,22 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RTB
+ bool "Preemptible Kernel (Basic RT)"
+ select PREEMPT_RT_BASE
+ help
+ This option is basically the same as (Low-Latency Desktop) but
+ enables changes which are preliminary for the full preemptible
+ RT kernel.
+
+config PREEMPT_RT_FULL
+ bool "Fully Preemptible Kernel (RT)"
+ depends on IRQ_FORCED_THREADING
+ select PREEMPT_RT_BASE
+ select PREEMPT_RCU
+ help
+ All and everything
+
endchoice
config PREEMPT_COUNT
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9a1ba77d6a50..285e18b2c420 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,290 @@ static struct {
.refcount = 0,
};
+/**
+ * hotplug_pcp - per cpu hotplug descriptor
+ * @unplug: set when pin_current_cpu() needs to sync tasks
+ * @sync_tsk: the task that waits for tasks to finish pinned sections
+ * @refcount: counter of tasks in pinned sections
+ * @grab_lock: set when the tasks entering pinned sections should wait
+ * @synced: notifier for @sync_tsk to tell cpu_down it's finished
+ * @unplug_wait: completion @sync_tsk blocks on before it sets @unplug;
+ *		completed by __cpu_unplug_wait()
+ * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
+ * @lock: takes the place of @mutex when CONFIG_PREEMPT_RT_FULL is set
+ * @mutex_init: zero if the mutex (or @lock) hasn't been initialized yet.
+ *
+ * Although @unplug and @sync_tsk may point to the same task, the @unplug
+ * is used as a flag and still exists after @sync_tsk has exited and
+ * @sync_tsk set to NULL.
+ */
+struct hotplug_pcp {
+ struct task_struct *unplug;
+ struct task_struct *sync_tsk;
+ int refcount;
+ int grab_lock;
+ struct completion synced;
+ struct completion unplug_wait;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /*
+ * Note, on PREEMPT_RT, the hotplug lock must save the state of
+ * the task, otherwise the mutex will cause the task to fail
+ * to sleep when required. (Because it's called from migrate_disable())
+ *
+ * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
+ * state.
+ */
+ spinlock_t lock;
+#else
+ struct mutex mutex;
+#endif
+ int mutex_init;
+};
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
+# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
+#else
+# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
+# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
+#endif
+
+static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
+
+/**
+ * pin_current_cpu - Prevent the current cpu from being unplugged
+ *
+ * Lightweight version of get_online_cpus() to prevent cpu from being
+ * unplugged when code runs in a migration disabled region.
+ *
+ * The pin is taken right away (refcount incremented) when no unplug is
+ * pending on this cpu, when the cpu's refcount is already non-zero, when
+ * preempt_count() > 1, or when the caller is the unplug task itself.
+ * Otherwise preemption is enabled and the task either blocks on the
+ * hotplug lock (grab_lock set) or calls migrate_me(), then re-reads the
+ * per cpu descriptor and retries. If migrate_me() returns zero while
+ * grab_lock is still clear, the pin is forced on the next pass.
+ *
+ * Must be called with preemption disabled (preempt_count = 1)!
+ */
+void pin_current_cpu(void)
+{
+ struct hotplug_pcp *hp;
+ int force = 0;
+
+retry:
+ hp = &__get_cpu_var(hotplug_pcp);
+
+ if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
+ hp->unplug == current) {
+ hp->refcount++;
+ return;
+ }
+ if (hp->grab_lock) {
+ preempt_enable();
+ hotplug_lock(hp);
+ hotplug_unlock(hp);
+ } else {
+ preempt_enable();
+ /*
+ * Try to push this task off of this CPU.
+ */
+ if (!migrate_me()) {
+ preempt_disable();
+ hp = &__get_cpu_var(hotplug_pcp);
+ if (!hp->grab_lock) {
+ /*
+ * Just let it continue; it's already pinned
+ * or about to sleep.
+ */
+ force = 1;
+ goto retry;
+ }
+ preempt_enable();
+ }
+ }
+ preempt_disable();
+ goto retry;
+}
+
+/**
+ * unpin_current_cpu - Drop one pin on the current cpu
+ *
+ * Decrements the per cpu pin count. When the count reaches zero and an
+ * unplug task is recorded that is not the caller, that task is woken.
+ *
+ * Must be called with preemption or interrupts disabled!
+ */
+void unpin_current_cpu(void)
+{
+	struct hotplug_pcp *pcp = &__get_cpu_var(hotplug_pcp);
+
+	WARN_ON(pcp->refcount <= 0);
+
+	/* Other pins are still outstanding: nobody to wake yet. */
+	if (--pcp->refcount)
+		return;
+
+	/* This is safe. sync_unplug_thread is pinned to this cpu */
+	if (pcp->unplug && pcp->unplug != current)
+		wake_up_process(pcp->unplug);
+}
+
+/*
+ * Sleep until hp->refcount drops to zero. The task state is set to
+ * TASK_UNINTERRUPTIBLE before every check of the count, and is still
+ * TASK_UNINTERRUPTIBLE when this function returns.
+ */
+static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
+{
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!hp->refcount)
+			break;
+		schedule_preempt_disabled();
+	}
+}
+
+static int sync_unplug_thread(void *data)
+{
+ struct hotplug_pcp *hp = data;
+
+ wait_for_completion(&hp->unplug_wait);
+ preempt_disable();
+ hp->unplug = current;
+ wait_for_pinned_cpus(hp);
+
+ /*
+ * This thread will synchronize the cpu_down() with threads
+ * that have pinned the CPU. When the pinned CPU count reaches
+ * zero, we inform the cpu_down code to continue to the next step.
+ * The complete() below pairs with the wait in __cpu_unplug_wait().
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ preempt_enable();
+ complete(&hp->synced);
+
+ /*
+ * If all succeeds, the next step will need tasks to wait till
+ * the CPU is offline before continuing. To do this, the grab_lock
+ * is set and tasks going into pin_current_cpu() will block on the
+ * mutex. But we still need to wait for those that are already in
+ * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
+ * will kick this thread out.
+ */
+ while (!hp->grab_lock && !kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+
+ /* Make sure grab_lock is seen before we see a stale completion */
+ smp_mb();
+
+ /*
+ * Now just before cpu_down() enters stop machine, we need to make
+ * sure all tasks that are in pinned CPU sections are out, and new
+ * tasks will now grab the lock, keeping them from entering pinned
+ * CPU sections.
+ * The complete() here pairs with the wait in __cpu_unplug_sync().
+ */
+ if (!kthread_should_stop()) {
+ preempt_disable();
+ wait_for_pinned_cpus(hp);
+ preempt_enable();
+ complete(&hp->synced);
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ set_current_state(TASK_RUNNING);
+
+ /*
+ * Force this thread off this CPU as it's going down and
+ * we don't want any more work on this CPU.
+ */
+ current->flags &= ~PF_NO_SETAFFINITY;
+ do_set_cpus_allowed(current, cpu_present_mask);
+ migrate_me();
+ return 0;
+}
+
+static void __cpu_unplug_sync(struct hotplug_pcp *hp)
+{
+ wake_up_process(hp->sync_tsk);
+ wait_for_completion(&hp->synced);
+}
+
+static void __cpu_unplug_wait(unsigned int cpu)
+{
+ struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+
+ complete(&hp->unplug_wait);
+ wait_for_completion(&hp->synced);
+}
+
+/*
+ * Create the sync_unplug thread for the target cpu, bind it to that cpu
+ * and wake it. This function does not wait for the thread; the thread
+ * itself first blocks on hp->unplug_wait.
+ *
+ * Returns 0 on success, or the negative error code from kthread_create().
+ * On failure the scheduler notification made here is undone, because the
+ * caller's error path does not go through cpu_unplug_done().
+ */
+static int cpu_unplug_begin(unsigned int cpu)
+{
+	struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+	int err;
+
+	/* Protected by cpu_hotplug.lock */
+	if (!hp->mutex_init) {
+#ifdef CONFIG_PREEMPT_RT_FULL
+		spin_lock_init(&hp->lock);
+#else
+		mutex_init(&hp->mutex);
+#endif
+		hp->mutex_init = 1;
+	}
+
+	/* Inform the scheduler to migrate tasks off this CPU */
+	tell_sched_cpu_down_begin(cpu);
+
+	init_completion(&hp->synced);
+	init_completion(&hp->unplug_wait);
+
+	hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
+	if (IS_ERR(hp->sync_tsk)) {
+		err = PTR_ERR(hp->sync_tsk);
+		hp->sync_tsk = NULL;
+		/* Balance tell_sched_cpu_down_begin() before bailing out */
+		tell_sched_cpu_down_done(cpu);
+		return err;
+	}
+	kthread_bind(hp->sync_tsk, cpu);
+
+	/*
+	 * Wait for tasks to get out of the pinned sections,
+	 * it's still OK if new tasks enter. Some CPU notifiers will
+	 * wait for tasks that are going to enter these sections and
+	 * we must not have them block.
+	 */
+	wake_up_process(hp->sync_tsk);
+	return 0;
+}
+
+static void cpu_unplug_sync(unsigned int cpu)
+{
+ struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+
+ init_completion(&hp->synced);
+ /* The completion needs to be initialized before setting grab_lock */
+ smp_wmb();
+
+ /* Grab the mutex before setting grab_lock */
+ hotplug_lock(hp);
+ hp->grab_lock = 1;
+
+ /*
+ * The CPU notifiers have been completed.
+ * Wait for tasks to get out of pinned CPU sections and have new
+ * tasks block until the CPU is completely down.
+ */
+ __cpu_unplug_sync(hp);
+
+ /* All done with the sync thread */
+ kthread_stop(hp->sync_tsk);
+ hp->sync_tsk = NULL;
+}
+
+static void cpu_unplug_done(unsigned int cpu)
+{
+ struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+
+ hp->unplug = NULL;
+ /* Let all tasks know cpu unplug is finished before cleaning up */
+ smp_wmb();
+
+ if (hp->sync_tsk)
+ kthread_stop(hp->sync_tsk);
+
+ if (hp->grab_lock) {
+ hotplug_unlock(hp);
+ /* protected by cpu_hotplug.lock */
+ hp->grab_lock = 0;
+ }
+ tell_sched_cpu_down_done(cpu);
+}
+
void get_online_cpus(void)
{
might_sleep();
@@ -79,15 +363,14 @@ void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
+ mutex_lock(&cpu_hotplug.lock);
if (WARN_ON(!cpu_hotplug.refcount))
cpu_hotplug.refcount++; /* try to fix things up */
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
-
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -282,13 +565,15 @@ static int __ref take_cpu_down(void *_param)
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
- int err, nr_calls = 0;
+ int mycpu, err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.mod = mod,
.hcpu = hcpu,
};
+ cpumask_var_t cpumask;
+ cpumask_var_t cpumask_org;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -296,7 +581,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
+ /* Move the downtaker off the unplug cpu */
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
+ free_cpumask_var(cpumask);
+ return -ENOMEM;
+ }
+
+ cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
+ cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
+ set_cpus_allowed_ptr(current, cpumask);
+ free_cpumask_var(cpumask);
+ migrate_disable();
+ mycpu = smp_processor_id();
+	/* The affinity change above should have moved us off @cpu; give up if not */
+	if (mycpu == cpu) {
+		printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
+		migrate_enable();
+		err = -EBUSY;
+		goto restore_cpus;
+	}
+ migrate_enable();
+
cpu_hotplug_begin();
+ err = cpu_unplug_begin(cpu);
+ if (err) {
+ printk("cpu_unplug_begin(%d) failed\n", cpu);
+ goto out_cancel;
+ }
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
@@ -322,8 +634,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
#endif
synchronize_rcu();
+ __cpu_unplug_wait(cpu);
smpboot_park_threads(cpu);
+ /* Notifiers are done. Don't let any more tasks pin this CPU. */
+ cpu_unplug_sync(cpu);
+
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
@@ -356,9 +672,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
check_for_tasks(cpu);
out_release:
+ cpu_unplug_done(cpu);
+out_cancel:
cpu_hotplug_done();
if (!err)
cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
+restore_cpus:
+ set_cpus_allowed_ptr(current, cpumask_org);
+ free_cpumask_var(cpumask_org);
return err;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..399dba686b19 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -554,7 +554,6 @@ int vkdb_printf(const char *fmt, va_list ap)
int linecount;
int colcount;
int logging, saved_loglevel = 0;
- int saved_trap_printk;
int got_printf_lock = 0;
int retlen = 0;
int fnd, len;
@@ -565,8 +564,6 @@ int vkdb_printf(const char *fmt, va_list ap)
unsigned long uninitialized_var(flags);
preempt_disable();
- saved_trap_printk = kdb_trap_printk;
- kdb_trap_printk = 0;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
@@ -833,7 +830,6 @@ kdb_print_out:
} else {
__release(kdb_printf_lock);
}
- kdb_trap_printk = saved_trap_printk;
preempt_enable();
return retlen;
}
@@ -843,9 +839,11 @@ int kdb_printf(const char *fmt, ...)
va_list ap;
int r;
+ kdb_trap_printk++;
va_start(ap, fmt);
r = vkdb_printf(fmt, ap);
va_end(ap);
+ kdb_trap_printk--;
return r;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 69cffb46db17..5fbeded315e3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6139,6 +6139,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hwc->hrtimer.function = perf_swevent_hrtimer;
+ hwc->hrtimer.irqsafe = 1;
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
diff --git a/kernel/exit.c b/kernel/exit.c
index 81b3d6789ee8..3b93e6aa071d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -146,7 +146,7 @@ static void __exit_signal(struct task_struct *tsk)
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
- flush_sigqueue(&tsk->pending);
+ flush_task_sigqueue(tsk);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
diff --git a/kernel/fork.c b/kernel/fork.c
index e2c685396295..36652bef3430 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,7 +96,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
-__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+DEFINE_RWLOCK(tasklist_lock); /* outer */
#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
@@ -232,13 +232,16 @@ static inline void put_signal_struct(struct signal_struct *sig)
if (atomic_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
-
+#ifdef CONFIG_PREEMPT_RT_BASE
+static
+#endif
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -247,7 +250,18 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
EXPORT_SYMBOL_GPL(__put_task_struct);
+#else
+/*
+ * Callback form of __put_task_struct(): @rhp is the put_rcu head embedded
+ * in the task_struct that is to be released.
+ */
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, put_rcu));
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+#endif
void __init __weak arch_task_cache_init(void) { }
@@ -600,6 +614,19 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+#ifdef CONFIG_PREEMPT_RT_BASE
+/*
+ * Delayed mm drop, written as a callback taking the delayed_drop
+ * rcu_head embedded in the mm_struct. Not strictly rcu, but we don't
+ * want another facility to make this work.
+ */
+void __mmdrop_delayed(struct rcu_head *rhp)
+{
+	__mmdrop(container_of(rhp, struct mm_struct, delayed_drop));
+}
+#endif
+
/*
* Decrement the use count and release all resources for an mm.
*/
@@ -1113,6 +1140,9 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
*/
static void posix_cpu_timers_init(struct task_struct *tsk)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ tsk->posix_timer_list = NULL;
+#endif
tsk->cputime_expires.prof_exp = 0;
tsk->cputime_expires.virt_exp = 0;
tsk->cputime_expires.sched_exp = 0;
@@ -1240,6 +1270,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
+ p->sigqueue_cache = NULL;
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
@@ -1247,7 +1278,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_init(&p->vtime_seqlock);
+ raw_spin_lock_init(&p->vtime_lock);
+ seqcount_init(&p->vtime_seq);
p->vtime_snap = 0;
p->vtime_snap_whence = VTIME_SLEEPING;
#endif
@@ -1300,6 +1332,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+ p->pagefault_disabled = 0;
+#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index fda2950f2ce4..5ba3c0fc4d19 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -712,7 +712,9 @@ void exit_pi_state_list(struct task_struct *curr)
* task still owns the PI-state:
*/
if (head->next != next) {
+ raw_spin_unlock_irq(&curr->pi_lock);
spin_unlock(&hb->lock);
+ raw_spin_lock_irq(&curr->pi_lock);
continue;
}
@@ -1712,6 +1714,16 @@ retry_private:
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
+ } else if (ret == -EAGAIN) {
+ /*
+ * Waiter was woken by timeout or
+ * signal and has set pi_blocked_on to
+ * PI_WAKEUP_INPROGRESS before we
+ * tried to enqueue it on the rtmutex.
+ */
+ this->pi_state = NULL;
+ free_pi_state(pi_state);
+ continue;
} else if (ret) {
/* -EDEADLK */
this->pi_state = NULL;
@@ -2565,7 +2577,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
- struct futex_hash_bucket *hb;
+ struct futex_hash_bucket *hb, *hb2;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2590,10 +2602,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter, false);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2624,20 +2633,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out_put_keys;
+ /*
+ * On RT we must avoid races with requeue and trying to block
+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
+ * serializing access to pi_blocked_on with pi_lock.
+ */
+ raw_spin_lock_irq(&current->pi_lock);
+ if (current->pi_blocked_on) {
+ /*
+ * We have been requeued or are in the process of
+ * being requeued.
+ */
+ raw_spin_unlock_irq(&current->pi_lock);
+ } else {
+ /*
+ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
+ * prevents a concurrent requeue from moving us to the
+ * uaddr2 rtmutex. After that we can safely acquire
+ * (and possibly block on) hb->lock.
+ */
+ current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
+ raw_spin_unlock_irq(&current->pi_lock);
+
+ spin_lock(&hb->lock);
+
+ /*
+ * Clean up pi_blocked_on. We might leak it otherwise
+ * when we succeeded with the hb->lock in the fast
+ * path.
+ */
+ raw_spin_lock_irq(&current->pi_lock);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock_irq(&current->pi_lock);
+
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
+ }
/*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
+ * In order to be here, we have either been requeued, are in
+ * the process of being requeued, or requeue successfully
+ * acquired uaddr2 on our behalf. If pi_blocked_on was
+ * non-null above, we may be racing with a requeue. Do not
+ * rely on q->lock_ptr to be hb2->lock until after blocking on
+ * hb->lock or hb2->lock. The futex_requeue dropped our key1
+ * reference and incremented our key2 reference count.
*/
+ hb2 = hash_futex(&key2);
/* Check if the requeue code acquired the second futex for us. */
if (!q.rt_waiter) {
@@ -2646,9 +2690,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* did a lock-steal - fix up the PI-state in that case.
*/
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- spin_unlock(q.lock_ptr);
+ spin_unlock(&hb2->lock);
}
} else {
/*
@@ -2661,7 +2706,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
- spin_lock(q.lock_ptr);
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 04d03745fb98..f97b9f65c5fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,11 +48,13 @@
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/timer.h>
+#include <linux/kthread.h>
#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <trace/events/timer.h>
+#include <trace/events/hist.h>
/*
* The timer bases:
@@ -630,8 +632,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
* When the callback is running, we do not reprogram the clock event
* device. The timer callback is either running on a different CPU or
* the callback is executed in the hrtimer_interrupt context. The
- * reprogramming is handled either by the softirq, which called the
- * callback or at the end of the hrtimer_interrupt.
+ * reprogramming is handled at the end of the hrtimer_interrupt.
*/
if (hrtimer_callback_running(timer))
return 0;
@@ -666,6 +667,9 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return res;
}
+static void __run_hrtimer(struct hrtimer *timer, ktime_t *now);
+static int hrtimer_rt_defer(struct hrtimer *timer);
+
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -682,9 +686,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
- return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
+ if (!(base->cpu_base->hres_active && hrtimer_reprogram(timer, base)))
+ return 0;
+ if (!wakeup)
+ return -ETIME;
+#ifdef CONFIG_PREEMPT_RT_BASE
+ if (!hrtimer_rt_defer(timer))
+ return -ETIME;
+#endif
+ return 1;
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -752,6 +765,44 @@ static void clock_was_set_work(struct work_struct *work)
static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * RT can not call schedule_work from real interrupt context.
+ * Need to make a thread to do the real work.
+ */
+static struct task_struct *clock_set_delay_thread;
+static bool do_clock_set_delay;
+
+static int run_clock_set_delay(void *ignore)
+{
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (do_clock_set_delay) {
+ do_clock_set_delay = false;
+ schedule_work(&hrtimer_work);
+ }
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+void clock_was_set_delayed(void)
+{
+ do_clock_set_delay = true;
+ /* Make visible before waking up process */
+ smp_wmb();
+ wake_up_process(clock_set_delay_thread);
+}
+
+/*
+ * Start the helper thread that issues schedule_work() on behalf of
+ * clock_was_set_delayed(). kthread_run() reports failure through an
+ * ERR_PTR() value and never returns NULL, so the result is checked
+ * with IS_ERR().
+ */
+static __init int create_clock_set_delay_thread(void)
+{
+	clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
+	BUG_ON(IS_ERR(clock_set_delay_thread));
+	return 0;
+}
+early_initcall(create_clock_set_delay_thread);
+#else /* PREEMPT_RT_FULL */
/*
* Called from timekeeping and resume code to reprogramm the hrtimer
* interrupt device on all cpus.
@@ -760,6 +811,7 @@ void clock_was_set_delayed(void)
{
schedule_work(&hrtimer_work);
}
+#endif
#else
@@ -769,12 +821,18 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
#endif /* CONFIG_HIGH_RES_TIMERS */
@@ -893,6 +951,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(hrtimer_forward);
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
+
+/**
+ * hrtimer_wait_for_timer - Wait for a running timer
+ *
+ * @timer: timer to wait for
+ *
+ * If the timer's callback function is currently executing, sleep on
+ * the waitqueue of the timer's cpu base until HRTIMER_STATE_CALLBACK
+ * is cleared. The waitqueue is woken up after the timer callback
+ * function has finished execution. Timers marked irqsafe, and timers
+ * without a base, are not waited for.
+ */
+void hrtimer_wait_for_timer(const struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base = timer->base;
+
+ if (base && base->cpu_base && !timer->irqsafe)
+ wait_event(base->cpu_base->wait,
+ !(timer->state & HRTIMER_STATE_CALLBACK));
+}
+
+#else
+# define wake_up_timer_waiters(b) do { } while (0)
+#endif
+
/*
* enqueue_hrtimer - internal function to (re)start a timer
*
@@ -936,6 +1020,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
goto out;
+ if (unlikely(!list_empty(&timer->cb_entry))) {
+ list_del_init(&timer->cb_entry);
+ goto out;
+ }
+
next_timer = timerqueue_getnext(&base->active);
timerqueue_del(&base->active, &timer->node);
if (&timer->node == next_timer) {
@@ -1024,6 +1113,17 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
timer_stats_hrtimer_set_start_info(timer);
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ {
+ ktime_t now = new_base->get_time();
+
+ if (ktime_to_ns(tim) < ktime_to_ns(now))
+ timer->praecox = now;
+ else
+ timer->praecox = ktime_set(0, 0);
+ }
+#endif
+
leftmost = enqueue_hrtimer(timer, new_base);
/*
@@ -1032,9 +1132,19 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
- && hrtimer_enqueue_reprogram(timer, new_base)) {
- if (wakeup) {
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) {
+ ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (ret < 0) {
+ /*
+ * In case we failed to reprogram the timer (mostly
+ * because our current timer is already elapsed),
+ * remove it again and report a failure. This avoids
+ * stale base->first entries.
+ */
+ debug_deactivate(timer);
+ __remove_hrtimer(timer, new_base,
+ timer->state & HRTIMER_STATE_CALLBACK, 0);
+ } else if (ret > 0) {
/*
* We need to drop cpu_base->lock to avoid a
* lock ordering issue vs. rq->lock.
@@ -1042,9 +1152,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
raw_spin_unlock(&new_base->cpu_base->lock);
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
local_irq_restore(flags);
- return ret;
- } else {
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ return 0;
}
}
@@ -1134,7 +1242,7 @@ int hrtimer_cancel(struct hrtimer *timer)
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_wait_for_timer(timer);
}
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1213,6 +1321,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base = hrtimer_clockid_to_base(clock_id);
timer->base = &cpu_base->clock_base[base];
+ INIT_LIST_HEAD(&timer->cb_entry);
timerqueue_init(&timer->node);
#ifdef CONFIG_TIMER_STATS
@@ -1296,6 +1405,126 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ /*
+ * Note, we clear the callback flag before we requeue the
+ * timer otherwise we trigger the callback_running() check
+ * in hrtimer_reprogram().
+ */
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(hrtimer_active(timer));
+ /*
+ * Enqueue the timer, if it's the leftmost timer then
+ * we need to reprogram it.
+ */
+ if (!enqueue_hrtimer(timer, base))
+ return;
+
+#ifndef CONFIG_HIGH_RES_TIMERS
+ }
+#else
+ if (base->cpu_base->hres_active &&
+ hrtimer_reprogram(timer, base))
+ goto requeue;
+
+ } else if (hrtimer_active(timer)) {
+ /*
+ * If the timer was rearmed on another CPU, reprogram
+ * the event device.
+ */
+ if (&timer->node == base->active.next &&
+ base->cpu_base->hres_active &&
+ hrtimer_reprogram(timer, base))
+ goto requeue;
+ }
+ return;
+
+requeue:
+ /*
+ * Timer is expired. Thus move it from tree to pending list
+ * again.
+ */
+ __remove_hrtimer(timer, base, timer->state, 0);
+ list_add_tail(&timer->cb_entry, &base->expired);
+#endif
+}
+
+/*
+ * The changes in mainline which removed the callback modes from
+ * hrtimer are not yet working with -rt. The non wakeup_process()
+ * based callbacks which involve sleeping locks need to be treated
+ * separately.
+ */
+static void hrtimer_rt_run_pending(void)
+{
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
+ struct hrtimer *timer;
+ int index, restart;
+
+ local_irq_disable();
+ cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
+
+ raw_spin_lock(&cpu_base->lock);
+
+ for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
+ base = &cpu_base->clock_base[index];
+
+ while (!list_empty(&base->expired)) {
+ timer = list_first_entry(&base->expired,
+ struct hrtimer, cb_entry);
+
+ /*
+ * Same as the above __run_hrtimer function
+ * just we run with interrupts enabled.
+ */
+ debug_hrtimer_deactivate(timer);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+ fn = timer->function;
+
+ raw_spin_unlock_irq(&cpu_base->lock);
+ restart = fn(timer);
+ raw_spin_lock_irq(&cpu_base->lock);
+
+ hrtimer_rt_reprogram(restart, timer, base);
+ }
+ }
+
+ raw_spin_unlock_irq(&cpu_base->lock);
+
+ wake_up_timer_waiters(cpu_base);
+}
+
+static int hrtimer_rt_defer(struct hrtimer *timer)
+{
+ if (timer->irqsafe)
+ return 0;
+
+ __remove_hrtimer(timer, timer->base, timer->state, 0);
+ list_add_tail(&timer->cb_entry, &timer->base->expired);
+ return 1;
+}
+
+#else
+
+static inline void hrtimer_rt_run_pending(void)
+{
+ hrtimer_peek_ahead_timers();
+}
+
+static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
+
+#endif
+
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1306,7 +1535,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
- int i, retries = 0;
+ int i, retries = 0, raise = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
@@ -1341,6 +1570,15 @@ retry:
timer = container_of(node, struct hrtimer, node);
+ trace_hrtimer_interrupt(raw_smp_processor_id(),
+ ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
+ timer->praecox : hrtimer_get_expires(timer),
+ basenow)),
+ current,
+ timer->function == hrtimer_wakeup ?
+ container_of(timer, struct hrtimer_sleeper,
+ timer)->task : NULL);
+
/*
* The immediate goal for using the softexpires is
* minimizing wakeups, not running timers at the
@@ -1366,7 +1604,10 @@ retry:
break;
}
- __run_hrtimer(timer, &basenow);
+ if (!hrtimer_rt_defer(timer))
+ __run_hrtimer(timer, &basenow);
+ else
+ raise = 1;
}
}
@@ -1381,7 +1622,7 @@ retry:
if (expires_next.tv64 == KTIME_MAX ||
!tick_program_event(expires_next, 0)) {
cpu_base->hang_detected = 0;
- return;
+ goto out;
}
/*
@@ -1425,6 +1666,9 @@ retry:
tick_program_event(expires_next, 1);
printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
ktime_to_ns(delta));
+out:
+ if (raise)
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/*
@@ -1460,40 +1704,16 @@ void hrtimer_peek_ahead_timers(void)
__hrtimer_peek_ahead_timers();
local_irq_restore(flags);
}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
- hrtimer_peek_ahead_timers();
-}
-
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
-void hrtimer_run_pending(void)
-{
- if (hrtimer_hres_active())
- return;
- /*
- * This _is_ ugly: We have to check in the softirq context,
- * whether we can switch to highres and / or nohz mode. The
- * clocksource switch happens in the timer interrupt with
- * xtime_lock held. Notification from there only sets the
- * check bit in the tick_oneshot code, otherwise we might
- * deadlock vs. xtime_lock.
- */
- if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
- hrtimer_switch_to_hres();
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ hrtimer_rt_run_pending();
}
/*
@@ -1504,11 +1724,18 @@ void hrtimer_run_queues(void)
struct timerqueue_node *node;
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
- int index, gettime = 1;
+ int index, gettime = 1, raise = 0;
if (hrtimer_hres_active())
return;
+ /*
+ * Check whether we can switch to highres mode.
+ */
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())
+ && hrtimer_switch_to_hres())
+ return;
+
for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
base = &cpu_base->clock_base[index];
if (!timerqueue_getnext(&base->active))
@@ -1529,10 +1756,16 @@ void hrtimer_run_queues(void)
hrtimer_get_expires_tv64(timer))
break;
- __run_hrtimer(timer, &base->softirq_time);
+ if (!hrtimer_rt_defer(timer))
+ __run_hrtimer(timer, &base->softirq_time);
+ else
+ raise = 1;
}
raw_spin_unlock(&cpu_base->lock);
}
+
+ if (raise)
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/*
@@ -1554,16 +1787,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
+ sl->timer.irqsafe = 1;
sl->task = task;
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
-static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
+ unsigned long state)
{
hrtimer_init_sleeper(t, current);
do {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(state);
hrtimer_start_expires(&t->timer, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
@@ -1607,7 +1842,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
- if (do_nanosleep(&t, HRTIMER_MODE_ABS))
+ /* cpu_chill() does not care about restart state. */
+ if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
goto out;
rmtp = restart->nanosleep.rmtp;
@@ -1624,8 +1860,10 @@ out:
return ret;
}
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
- const enum hrtimer_mode mode, const clockid_t clockid)
+static long
+__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ const enum hrtimer_mode mode, const clockid_t clockid,
+ unsigned long state)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
@@ -1638,7 +1876,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
hrtimer_init_on_stack(&t.timer, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
- if (do_nanosleep(&t, mode))
+ if (do_nanosleep(&t, mode, state))
goto out;
/* Absolute timers do not update the rmtp value and restart: */
@@ -1665,6 +1903,12 @@ out:
return ret;
}
+long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ const enum hrtimer_mode mode, const clockid_t clockid)
+{
+ return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
+}
+
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
@@ -1679,6 +1923,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Sleep for 1 ms in hope whoever holds what we want will let it go.
+ */
+void cpu_chill(void)
+{
+ struct timespec tu = {
+ .tv_nsec = NSEC_PER_MSEC,
+ };
+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
+
+ current->flags |= PF_NOFREEZE;
+ __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
+ TASK_UNINTERRUPTIBLE);
+ if (!freeze_flag)
+ current->flags &= ~PF_NOFREEZE;
+}
+EXPORT_SYMBOL(cpu_chill);
+#endif
+
/*
* Functions related to boot-time initialization:
*/
@@ -1690,9 +1954,13 @@ static void init_hrtimers_cpu(int cpu)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
+ INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
}
hrtimer_init_hres(cpu_base);
+#ifdef CONFIG_PREEMPT_RT_BASE
+ init_waitqueue_head(&cpu_base->wait);
+#endif
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1805,9 +2073,7 @@ void __init hrtimers_init(void)
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
}
/**
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 131ca176b497..7f50c558e46b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,6 +132,8 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
+ struct pt_regs *regs = get_irq_regs();
+ u64 ip = regs ? instruction_pointer(regs) : 0;
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
@@ -172,7 +174,11 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
action = action->next;
} while (action);
- add_interrupt_randomness(irq, flags);
+#ifndef CONFIG_PREEMPT_RT_FULL
+ add_interrupt_randomness(irq, flags, ip);
+#else
+ desc->random_ip = ip;
+#endif
if (!noirqdebug)
note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ebb8a9e937fa..3db46546fa6c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -22,6 +22,7 @@
#include "internals.h"
#ifdef CONFIG_IRQ_FORCED_THREADING
+# ifndef CONFIG_PREEMPT_RT_BASE
__read_mostly bool force_irqthreads;
static int __init setup_forced_irqthreads(char *arg)
@@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
+# endif
#endif
/**
@@ -143,6 +145,62 @@ static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void _irq_affinity_notify(struct irq_affinity_notify *notify);
+static struct task_struct *set_affinity_helper;
+static LIST_HEAD(affinity_list);
+static DEFINE_RAW_SPINLOCK(affinity_list_lock);
+
+static int set_affinity_thread(void *unused)
+{
+ while (1) {
+ struct irq_affinity_notify *notify;
+ int empty;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ raw_spin_lock_irq(&affinity_list_lock);
+ empty = list_empty(&affinity_list);
+ raw_spin_unlock_irq(&affinity_list_lock);
+
+ if (empty)
+ schedule();
+ if (kthread_should_stop())
+ break;
+ set_current_state(TASK_RUNNING);
+try_next:
+ notify = NULL;
+
+ raw_spin_lock_irq(&affinity_list_lock);
+ if (!list_empty(&affinity_list)) {
+ notify = list_first_entry(&affinity_list,
+ struct irq_affinity_notify, list);
+ list_del_init(&notify->list);
+ }
+ raw_spin_unlock_irq(&affinity_list_lock);
+
+ if (!notify)
+ continue;
+ _irq_affinity_notify(notify);
+ goto try_next;
+ }
+ return 0;
+}
+
+static void init_helper_thread(void)
+{
+ if (set_affinity_helper)
+ return;
+ set_affinity_helper = kthread_run(set_affinity_thread, NULL,
+ "affinity-cb");
+ WARN_ON(IS_ERR(set_affinity_helper));
+}
+#else
+
+static inline void init_helper_thread(void) { }
+
+#endif
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -181,7 +239,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /* Queue the notifier for the helper thread: (new entry, list head). */
+ raw_spin_lock(&affinity_list_lock);
+ if (list_empty(&desc->affinity_notify->list))
+ list_add_tail(&desc->affinity_notify->list, &affinity_list);
+ raw_spin_unlock(&affinity_list_lock);
+ wake_up_process(set_affinity_helper);
+#else
schedule_work(&desc->affinity_notify->work);
+#endif
}
irqd_set(data, IRQD_AFFINITY_SET);
@@ -216,10 +284,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
-static void irq_affinity_notify(struct work_struct *work)
+static void _irq_affinity_notify(struct irq_affinity_notify *notify)
{
- struct irq_affinity_notify *notify =
- container_of(work, struct irq_affinity_notify, work);
struct irq_desc *desc = irq_to_desc(notify->irq);
cpumask_var_t cpumask;
unsigned long flags;
@@ -241,6 +307,13 @@ out:
kref_put(&notify->kref, notify->release);
}
+static void irq_affinity_notify(struct work_struct *work)
+{
+ struct irq_affinity_notify *notify =
+ container_of(work, struct irq_affinity_notify, work);
+ _irq_affinity_notify(notify);
+}
+
/**
* irq_set_affinity_notifier - control notification of IRQ affinity changes
* @irq: Interrupt for which to enable/disable notification
@@ -270,6 +343,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
notify->irq = irq;
kref_init(&notify->kref);
INIT_WORK(&notify->work, irq_affinity_notify);
+ INIT_LIST_HEAD(&notify->list);
+ init_helper_thread();
}
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -776,7 +851,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
local_bh_disable();
ret = action->thread_fn(action->irq, action->dev_id);
irq_finalize_oneshot(desc, action);
- local_bh_enable();
+ /*
+ * Interrupts which have real time requirements can be set up
+ * to avoid softirq processing in the thread handler. This is
+ * safe as these interrupts do not raise soft interrupts.
+ */
+ if (irq_settings_no_softirq_call(desc))
+ _local_bh_enable();
+ else
+ local_bh_enable();
return ret;
}
@@ -859,6 +942,12 @@ static int irq_thread(void *data)
if (action_ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ migrate_disable();
+ add_interrupt_randomness(action->irq, 0,
+ desc->random_ip ^ (unsigned long) action);
+ migrate_enable();
+#endif
wake_threads_waitq(desc);
}
@@ -1121,6 +1210,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
+ if (new->flags & IRQF_NO_SOFTIRQ_CALL)
+ irq_settings_set_no_softirq_call(desc);
+
/* Set default affinity mask once everything is setup */
setup_affinity(irq, desc, mask);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84cc60f..34b803b89d41 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,6 +15,7 @@ enum {
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
+ _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -28,6 +29,7 @@ enum {
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
+#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -38,6 +40,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
}
+static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
+}
+
+static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
+}
+
static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..903a69c45689 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..35d21f93bbe8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -20,6 +20,9 @@
static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+#ifdef CONFIG_PREEMPT_RT_FULL
+static DEFINE_PER_CPU(struct llist_head, hirq_work_list);
+#endif
static DEFINE_PER_CPU(int, irq_work_raised);
/*
@@ -48,7 +51,11 @@ static bool irq_work_claim(struct irq_work *work)
return true;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+void arch_irq_work_raise(void)
+#else
void __weak arch_irq_work_raise(void)
+#endif
{
/*
* Lame architectures will get the timer tick callback
@@ -70,8 +77,12 @@ void irq_work_queue(struct irq_work *work)
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
- llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (work->flags & IRQ_WORK_HARD_IRQ)
+ llist_add(&work->llnode, &__get_cpu_var(hirq_work_list));
+ else
+#endif
+ llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
/*
* If the work is not "lazy" or the tick is stopped, raise the irq
* work interrupt (if supported by the arch), otherwise, just wait
@@ -115,12 +126,18 @@ static void __irq_work_run(void)
__this_cpu_write(irq_work_raised, 0);
barrier();
- this_list = &__get_cpu_var(irq_work_list);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (in_irq())
+ this_list = &__get_cpu_var(hirq_work_list);
+ else
+#endif
+ this_list = &__get_cpu_var(irq_work_list);
if (llist_empty(this_list))
return;
+#ifndef CONFIG_PREEMPT_RT_FULL
BUG_ON(!irqs_disabled());
-
+#endif
llnode = llist_del_all(this_list);
while (llnode != NULL) {
work = llist_entry(llnode, struct irq_work, llnode);
@@ -152,7 +169,9 @@ static void __irq_work_run(void)
*/
void irq_work_run(void)
{
+#ifndef CONFIG_PREEMPT_RT_FULL
BUG_ON(!in_irq());
+#endif
__irq_work_run();
}
EXPORT_SYMBOL_GPL(irq_work_run);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 8d262b467573..d0513909d663 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -213,6 +213,7 @@ again:
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
+ hrtimer_wait_for_timer(&tsk->signal->real_timer);
goto again;
}
expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760f..30f00bcc6a34 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -132,6 +132,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_KEXEC */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+static ssize_t realtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -197,6 +206,9 @@ static struct attribute * kernel_attrs[] = {
&vmcoreinfo_attr.attr,
#endif
&rcu_expedited_attr.attr,
+#ifdef CONFIG_PREEMPT_RT_FULL
+ &realtime_attr.attr,
+#endif
NULL
};
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..ac0e23925127 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += semaphore.o lglock.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = -pg
@@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
endif
+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
+obj-y += mutex.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-y += rwsem.o
+endif
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
@@ -20,6 +24,9 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
+endif
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..9397974b142f 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -4,6 +4,15 @@
#include <linux/cpu.h>
#include <linux/string.h>
+#ifndef CONFIG_PREEMPT_RT_FULL
+# define lg_lock_ptr arch_spinlock_t
+# define lg_do_lock(l) arch_spin_lock(l)
+# define lg_do_unlock(l) arch_spin_unlock(l)
+#else
+# define lg_lock_ptr struct rt_mutex
+# define lg_do_lock(l) __rt_spin_lock(l)
+# define lg_do_unlock(l) __rt_spin_unlock(l)
+#endif
/*
* Note there is no uninit, so lglocks cannot be defined in
* modules (but it's fine to use them from there)
@@ -12,51 +21,60 @@
void lg_lock_init(struct lglock *lg, char *name)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
+
+ rt_mutex_init(lock);
+ }
+#endif
LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
}
EXPORT_SYMBOL(lg_lock_init);
void lg_local_lock(struct lglock *lg)
{
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
- preempt_disable();
+ migrate_disable();
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
+ lg_do_lock(lock);
}
EXPORT_SYMBOL(lg_local_lock);
void lg_local_unlock(struct lglock *lg)
{
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
+ lg_do_unlock(lock);
+ migrate_enable();
}
EXPORT_SYMBOL(lg_local_unlock);
void lg_local_lock_cpu(struct lglock *lg, int cpu)
{
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
- preempt_disable();
+ preempt_disable_nort();
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
+ lg_do_lock(lock);
}
EXPORT_SYMBOL(lg_local_lock_cpu);
void lg_local_unlock_cpu(struct lglock *lg, int cpu)
{
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
+ lg_do_unlock(lock);
+ preempt_enable_nort();
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
@@ -64,12 +82,12 @@ void lg_global_lock(struct lglock *lg)
{
int i;
- preempt_disable();
+ preempt_disable_nort();
lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
+ lg_do_lock(lock);
}
}
EXPORT_SYMBOL(lg_global_lock);
@@ -80,10 +98,35 @@ void lg_global_unlock(struct lglock *lg)
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
+ lg_lock_ptr *lock;
lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
+ lg_do_unlock(lock);
}
- preempt_enable();
+ preempt_enable_nort();
}
EXPORT_SYMBOL(lg_global_unlock);
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * HACK: If you use this, you get to keep the pieces.
+ * Used in queue_stop_cpus_work() when stop machinery
+ * is called from inactive CPU, so we can't schedule.
+ */
+# define lg_do_trylock_relax(l) \
+ do { \
+ while (!__rt_spin_trylock(l)) \
+ cpu_relax(); \
+ } while (0)
+
+void lg_global_trylock_relax(struct lglock *lg)
+{
+ int i;
+
+ lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ for_each_possible_cpu(i) {
+ lg_lock_ptr *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ lg_do_trylock_relax(lock);
+ }
+}
+#endif
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..b602d9df4601 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3543,6 +3543,7 @@ static void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -3557,6 +3558,7 @@ static void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..2db0f42d5c64 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -84,8 +84,12 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
down_read(&brw->rw_sem);
atomic_inc(&brw->slow_read_ctr);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ up_read(&brw->rw_sem);
+#else
/* avoid up_read()->rwsem_release() */
__up_read(&brw->rw_sem);
+#endif
}
void percpu_up_read(struct percpu_rw_semaphore *brw)
diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
new file mode 100644
index 000000000000..90b8ba03e2a4
--- /dev/null
+++ b/kernel/locking/rt.c
@@ -0,0 +1,437 @@
+/*
+ * kernel/locking/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * historic credit for proving that Linux spinlocks can be implemented via
+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
+ * and others) who prototyped it on 2.4 and did lots of comparative
+ * research and analysis; TimeSys, for proving that you can implement a
+ * fully preemptible kernel via the use of IRQ threading and mutexes;
+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
+ * right one; and to MontaVista, who ported pmutexes to 2.6.
+ *
+ * This code is a from-scratch implementation and is not based on pmutexes,
+ * but the idea of converting spinlocks to mutexes is used here too.
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ *
+ * Pending ownership of locks and ownership stealing:
+ *
+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
+ *
+ * (also by Steven Rostedt)
+ * - Converted single pi_lock to individual task locks.
+ *
+ * By Esben Nielsen:
+ * Doing priority inheritance with help of the scheduler.
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * - major rework based on Esben Nielsens initial patch
+ * - replaced thread_info references by task_struct refs
+ * - removed task->pending_owner dependency
+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
+ * in the scheduler return path as discussed with Steven Rostedt
+ *
+ * Copyright (C) 2006, Kihon Technologies Inc.
+ * Steven Rostedt <rostedt@goodmis.org>
+ * - debugged and patched Thomas Gleixner's rework.
+ * - added back the cmpxchg to the rework.
+ * - turned atomic require back on for SMP.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/hrtimer.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * struct mutex functions
+ */
+/*
+ * Initialize an RT struct mutex. With CONFIG_DEBUG_LOCK_ALLOC the
+ * lockdep map is registered under @name and @key. The underlying
+ * rt_mutex gets save_state = 0.
+ */
+void __mutex_do_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map(&mutex->dep_map, name, key, 0);
+#endif
+ mutex->lock.save_state = 0;
+}
+EXPORT_SYMBOL(__mutex_do_init);
+
+/*
+ * Acquire @lock: annotate the acquisition for lockdep, then take the
+ * underlying rt_mutex with rt_mutex_lock().
+ */
+void __lockfunc _mutex_lock(struct mutex *lock)
+{
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ rt_mutex_lock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_lock);
+
+/*
+ * Interruptible acquire of @lock. Returns the result of
+ * rt_mutex_lock_interruptible(); when that is non-zero the lockdep
+ * annotation taken up front is released again.
+ */
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_lock_interruptible(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+/*
+ * Killable acquire of @lock. Returns the result of
+ * rt_mutex_lock_killable(); when that is non-zero the lockdep
+ * annotation taken up front is released again.
+ */
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_lock_killable(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Same as _mutex_lock(), but the lockdep annotation carries @subclass.
+ */
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ rt_mutex_lock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+/*
+ * Same as _mutex_lock(), but the lockdep annotation names @nest as the
+ * nest lock.
+ */
+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
+ rt_mutex_lock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_lock_nest_lock);
+
+/*
+ * Same as _mutex_lock_interruptible(), but the lockdep annotation
+ * carries @subclass. A non-zero return means the lock was not taken
+ * and the annotation has been released.
+ */
+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ ret = rt_mutex_lock_interruptible(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
+
+/*
+ * Same as _mutex_lock_killable(), but the lockdep annotation carries
+ * @subclass. A non-zero return means the lock was not taken and the
+ * annotation has been released.
+ *
+ * NOTE(review): unlike the other *_nested variants this one calls
+ * mutex_acquire() with the subclass instead of mutex_acquire_nest();
+ * presumably equivalent - confirm against the lockdep macros.
+ */
+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = rt_mutex_lock_killable(&lock->lock, 0);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable_nested);
+#endif
+
+/*
+ * Try to take @lock without blocking in this function. Returns the
+ * rt_mutex_trylock() result; lockdep is only told about the
+ * acquisition when that result is non-zero.
+ */
+int __lockfunc _mutex_trylock(struct mutex *lock)
+{
+ int ret = rt_mutex_trylock(&lock->lock);
+
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_trylock);
+
+/*
+ * Release @lock: drop the lockdep annotation, then unlock the
+ * underlying rt_mutex.
+ */
+void __lockfunc _mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_unlock);
+
+/*
+ * rwlock_t functions
+ */
+/*
+ * Try to take @rwlock for writing. Migration is disabled before the
+ * attempt and re-enabled if the trylock fails, so it stays disabled
+ * only while the lock is held. Returns the rt_mutex_trylock() result.
+ */
+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ migrate_disable();
+ ret = rt_mutex_trylock(&rwlock->lock);
+ if (ret)
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+/*
+ * irqsave flavour of rt_write_trylock(). *@flags is simply set to 0
+ * and the work is delegated to rt_write_trylock().
+ */
+int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
+{
+ int ret;
+
+ *flags = 0;
+ ret = rt_write_trylock(rwlock);
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock_irqsave);
+
+/*
+ * Try to take @rwlock for reading. Returns non-zero on success, in
+ * which case read_depth has been incremented, and 0 on failure.
+ */
+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
+{
+ struct rt_mutex *lock = &rwlock->lock;
+ int ret = 1;
+
+ /*
+ * recursive read locks succeed when current owns the lock,
+ * but not when read_depth == 0 which means that the lock is
+ * write locked.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ /* Not the owner: a real trylock, with migration disabled on success */
+ migrate_disable();
+ ret = rt_mutex_trylock(lock);
+ if (ret)
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+
+ } else if (!rwlock->read_depth) {
+ ret = 0;
+ }
+
+ if (ret)
+ rwlock->read_depth++;
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+/*
+ * Take @rwlock for writing: annotate for lockdep, disable migration,
+ * then acquire the underlying rt_mutex via __rt_spin_lock().
+ */
+void __lockfunc rt_write_lock(rwlock_t *rwlock)
+{
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ migrate_disable();
+ __rt_spin_lock(&rwlock->lock);
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+/*
+ * Take @rwlock for reading. The underlying rt_mutex is only acquired
+ * when current does not already own it; read_depth is incremented in
+ * either case.
+ */
+void __lockfunc rt_read_lock(rwlock_t *rwlock)
+{
+ struct rt_mutex *lock = &rwlock->lock;
+
+
+ /*
+ * recursive read locks succeed when current owns the lock
+ */
+ if (rt_mutex_owner(lock) != current) {
+ migrate_disable();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+ }
+ rwlock->read_depth++;
+}
+
+EXPORT_SYMBOL(rt_read_lock);
+
+/*
+ * Release a write-held @rwlock: drop the lockdep annotation, unlock
+ * the underlying rt_mutex and re-enable migration.
+ */
+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ __rt_spin_unlock(&rwlock->lock);
+ migrate_enable();
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+/*
+ * Drop one read reference on @rwlock. The underlying rt_mutex is
+ * unlocked and migration re-enabled only when the last reference
+ * goes away.
+ */
+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
+{
+ /* Release the lock only when read_depth is down to 0 */
+ if (--rwlock->read_depth == 0) {
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ __rt_spin_unlock(&rwlock->lock);
+ migrate_enable();
+ }
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+/*
+ * irqsave flavour of rt_write_lock(). Delegates to rt_write_lock()
+ * and always returns 0 as the flags value.
+ */
+unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
+{
+ rt_write_lock(rwlock);
+
+ return 0;
+}
+EXPORT_SYMBOL(rt_write_lock_irqsave);
+
+/*
+ * irqsave flavour of rt_read_lock(). Delegates to rt_read_lock() and
+ * always returns 0 as the flags value.
+ */
+unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
+{
+ rt_read_lock(rwlock);
+
+ return 0;
+}
+EXPORT_SYMBOL(rt_read_lock_irqsave);
+
+/*
+ * Initialize an RT rwlock_t. With CONFIG_DEBUG_LOCK_ALLOC the lockdep
+ * map is registered under @name and @key. The underlying rt_mutex
+ * gets save_state = 1 and the read recursion depth starts at 0.
+ */
+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map(&rwlock->dep_map, name, key, 0);
+#endif
+ rwlock->lock.save_state = 1;
+ rwlock->read_depth = 0;
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+
+/*
+ * rw_semaphores
+ */
+
+/*
+ * Release a write-held @rwsem: drop the lockdep annotation and unlock
+ * the underlying rt_mutex.
+ */
+void rt_up_write(struct rw_semaphore *rwsem)
+{
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_up_write);
+
+/*
+ * Release a read-held @rwsem. The body is the same as rt_up_write():
+ * drop the lockdep annotation and unlock the underlying rt_mutex.
+ */
+void rt_up_read(struct rw_semaphore *rwsem)
+{
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_up_read);
+
+/*
+ * downgrade a write lock into a read lock
+ * - in this implementation the only action is to BUG if the caller
+ * does not own the underlying rt_mutex; no waiters are woken here
+ */
+void rt_downgrade_write(struct rw_semaphore *rwsem)
+{
+ BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
+}
+EXPORT_SYMBOL(rt_downgrade_write);
+
+/*
+ * Try to take @rwsem for writing. Returns the rt_mutex_trylock()
+ * result; lockdep is only told about the acquisition on success.
+ */
+int rt_down_write_trylock(struct rw_semaphore *rwsem)
+{
+ int ret = rt_mutex_trylock(&rwsem->lock);
+
+ if (ret)
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(rt_down_write_trylock);
+
+/*
+ * Take @rwsem for writing: annotate for lockdep, then lock the
+ * underlying rt_mutex.
+ */
+void rt_down_write(struct rw_semaphore *rwsem)
+{
+ rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_down_write);
+
+/*
+ * Same as rt_down_write(), but the lockdep annotation carries
+ * @subclass.
+ */
+void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
+{
+ rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_down_write_nested);
+
+/*
+ * Same as rt_down_write(), but the lockdep annotation names @nest as
+ * the nest lock.
+ */
+void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
+ struct lockdep_map *nest)
+{
+ rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_down_write_nested_lock);
+
+/*
+ * Try to take @rwsem for reading. This is a plain trylock on the
+ * underlying rt_mutex; lockdep is only told about the acquisition on
+ * success. Returns the rt_mutex_trylock() result.
+ */
+int rt_down_read_trylock(struct rw_semaphore *rwsem)
+{
+ int ret;
+
+ ret = rt_mutex_trylock(&rwsem->lock);
+ if (ret)
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(rt_down_read_trylock);
+
+/*
+ * Common helper for the read-side down operations: annotate for
+ * lockdep with @subclass, then lock the underlying rt_mutex.
+ */
+static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
+{
+ rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+ rt_mutex_lock(&rwsem->lock);
+}
+
+/* Take @rwsem for reading, using lockdep subclass 0. */
+void rt_down_read(struct rw_semaphore *rwsem)
+{
+ __rt_down_read(rwsem, 0);
+}
+EXPORT_SYMBOL(rt_down_read);
+
+/* Take @rwsem for reading, using the given lockdep @subclass. */
+void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
+{
+ __rt_down_read(rwsem, subclass);
+}
+EXPORT_SYMBOL(rt_down_read_nested);
+
+/*
+ * Initialize an RT rw_semaphore. With CONFIG_DEBUG_LOCK_ALLOC the
+ * lockdep map is registered under @name and @key. The underlying
+ * rt_mutex gets save_state = 0.
+ */
+void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
+ lockdep_init_map(&rwsem->dep_map, name, key, 0);
+#endif
+ rwsem->lock.save_state = 0;
+}
+EXPORT_SYMBOL(__rt_rwsem_init);
+
+/**
+ * atomic_dec_and_mutex_lock - decrement, and keep the mutex if we reach 0
+ * @cnt: the atomic counter to decrement
+ * @lock: the mutex that is held on return when the counter reached 0
+ *
+ * Returns 1 with @lock held if the decrement took @cnt to 0, otherwise
+ * returns 0 with @lock not held.
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* Fast path: the counter is not 1, so this decrement cannot reach 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+
+	/* The counter may drop to 0: do the decrement under the lock */
+	mutex_lock(lock);
+	if (atomic_dec_and_test(cnt))
+		return 1;	/* reached 0, the caller now holds the lock */
+
+	/* The decrement happened but did not reach 0: give the lock back */
+	mutex_unlock(lock);
+	return 0;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1ce0f6c6eb01..6c40660d1168 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,6 +8,12 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
+ *
* See Documentation/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
@@ -16,6 +22,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/timer.h>
+#include <linux/ww_mutex.h>
#include "rtmutex_common.h"
@@ -69,6 +76,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
clear_rt_mutex_waiters(lock);
}
+/*
+ * Tell a genuine waiter apart from the special pi_blocked_on values:
+ * returns 1 only if @waiter is neither NULL nor one of the
+ * PI_WAKEUP_INPROGRESS / PI_REQUEUE_INPROGRESS markers.
+ */
+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
+{
+	if (!waiter)
+		return 0;
+	if (waiter == PI_WAKEUP_INPROGRESS || waiter == PI_REQUEUE_INPROGRESS)
+		return 0;
+	return 1;
+}
+
/*
* We can speed up the acquire/release, if the architecture
* supports cmpxchg and if there's no debugging state to be set up
@@ -265,6 +278,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
}
/*
+ * Called by sched_setscheduler() to check whether the priority change
+ * is overruled by a possible priority boosting.
+ */
+int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+	struct task_struct *top;
+
+	/* Nobody is boosting @task, so nothing can overrule the change */
+	if (!task_has_pi_waiters(task))
+		return 0;
+
+	/* Compare the prio value of the top pi waiter's task with @newprio */
+	top = task_top_pi_waiter(task)->task;
+	return top->prio <= newprio;
+}
+
+/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
@@ -295,6 +320,14 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
+/*
+ * Wake the task behind @waiter. Waiters marked with savestate are
+ * woken through wake_up_lock_sleeper(), all others through
+ * wake_up_process().
+ */
+static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
+{
+	struct task_struct *sleeper = waiter->task;
+
+	if (!waiter->savestate) {
+		wake_up_process(sleeper);
+		return;
+	}
+	wake_up_lock_sleeper(sleeper);
+}
+
/*
* Max number of times we'll walk the boosting chain:
*/
@@ -377,7 +410,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* reached or the state of the chain has changed while we
* dropped the locks.
*/
- if (!waiter)
+ if (!rt_mutex_real_waiter(waiter))
goto out_unlock_pi;
/*
@@ -452,13 +485,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
if (!rt_mutex_owner(lock)) {
+ struct rt_mutex_waiter *lock_top_waiter;
+
/*
* If the requeue above changed the top waiter, then we need
* to wake the new top waiter up to try to get the lock.
*/
-
- if (top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ lock_top_waiter = rt_mutex_top_waiter(lock);
+ if (top_waiter != lock_top_waiter)
+ rt_mutex_wake_waiter(lock_top_waiter);
raw_spin_unlock(&lock->wait_lock);
goto out_put_task;
}
@@ -516,6 +551,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
return ret;
}
+
+#define STEAL_NORMAL 0
+#define STEAL_LATERAL 1
+
+/*
+ * Decide whether @task may take the lock ahead of @pendowner.
+ *
+ * For STEAL_NORMAL, and for any RT task, @task->prio must be strictly
+ * lower in value than @pendowner->prio. Otherwise (lateral steal by a
+ * non-RT task) an equal prio value is sufficient.
+ *
+ * Note that RT tasks are excluded from lateral-steals to prevent the
+ * introduction of an unbounded latency
+ */
+static inline int lock_is_stealable(struct task_struct *task,
+				    struct task_struct *pendowner, int mode)
+{
+	if (mode == STEAL_NORMAL || rt_task(task))
+		return task->prio < pendowner->prio;
+
+	return task->prio <= pendowner->prio;
+}
+
/*
* Try to take an rt-mutex
*
@@ -525,8 +579,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* @task: the task which wants to acquire the lock
* @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int
+__try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter, int mode)
{
/*
* We have to be careful here if the atomic speedups are
@@ -559,12 +614,14 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* 3) it is top waiter
*/
if (rt_mutex_has_waiters(lock)) {
- if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
- if (!waiter || waiter != rt_mutex_top_waiter(lock))
- return 0;
- }
+ struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
+
+ if (task != pown && !lock_is_stealable(task, pown, mode))
+ return 0;
}
+ /* We got the lock. */
+
if (waiter || rt_mutex_has_waiters(lock)) {
unsigned long flags;
struct rt_mutex_waiter *top;
@@ -588,7 +645,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
- /* We got the lock. */
debug_rt_mutex_lock(lock);
rt_mutex_set_owner(lock, task);
@@ -598,6 +654,13 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
return 1;
}
+/*
+ * Wrapper around __try_to_take_rt_mutex() that always uses
+ * STEAL_NORMAL as the steal mode.
+ */
+static inline int
+try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
+{
+ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
+}
+
/*
* Task blocks on lock.
*
@@ -629,6 +692,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * In the case of futex requeue PI, this will be a proxy
+ * lock. The task will wake unaware that it is enqueued on
+ * this lock. Avoid blocking on two locks and corrupting
+ * pi_blocked_on via the PI_WAKEUP_INPROGRESS
+ * flag. futex_wait_requeue_pi() sets this when it wakes up
+ * before requeue (due to a signal or timeout). Do not enqueue
+ * the task if PI_WAKEUP_INPROGRESS is set.
+ */
+ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return -EAGAIN;
+ }
+
+ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
+
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -652,7 +732,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on)
+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
chain_walk = 1;
} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
chain_walk = 1;
@@ -729,7 +809,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* long as we hold lock->wait_lock. The waiter task needs to
* acquire it in order to dequeue the waiter.
*/
- wake_up_process(waiter->task);
+ rt_mutex_wake_waiter(waiter);
}
/*
@@ -769,7 +849,8 @@ static void remove_waiter(struct rt_mutex *lock,
__rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
+ next_lock = task_blocked_on_lock(owner);
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
@@ -801,25 +882,376 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
+ if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
!dl_prio(task->prio))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
-
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * preemptible spin_lock functions:
+ */
+/*
+ * Lock fast path: try to install current as owner with a single
+ * cmpxchg on an unowned @lock; on failure fall back to @slowfn.
+ */
+static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ might_sleep();
+
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
+ rt_mutex_deadlock_account_lock(lock, current);
+ else
+ slowfn(lock);
+}
+
+/*
+ * Unlock fast path: try to clear the owner with a single cmpxchg
+ * (current -> NULL); on failure fall back to @slowfn.
+ */
+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Spin for as long as @owner still owns @lock and is running on a CPU.
+ *
+ * Returns 0 when the owner of @lock changed while spinning, and 1
+ * when @owner was found not to be on a CPU; the caller in
+ * rt_spin_lock_slowlock() schedules in the latter case.
+ *
+ * Note that owner is a speculative pointer and dereferencing relies
+ * on rcu_read_lock() and the check against the lock owner.
+ */
+static int adaptive_wait(struct rt_mutex *lock,
+ struct task_struct *owner)
+{
+ int res = 0;
+
+ rcu_read_lock();
+ for (;;) {
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that owner->on_cpu is dereferenced _after_
+ * checking the above to be valid.
+ */
+ barrier();
+ if (!owner->on_cpu) {
+ res = 1;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
+}
+#else
+/* !CONFIG_SMP variant: never spins, always returns 1. */
+static int adaptive_wait(struct rt_mutex *lock,
+ struct task_struct *orig_owner)
+{
+ return 1;
+}
+#endif
+
+/* Shorthands for taking/dropping a raw spinlock with the _irq variants. */
+# define pi_lock(lock) raw_spin_lock_irq(lock)
+# define pi_unlock(lock) raw_spin_unlock_irq(lock)
+
+/*
+ * Slow path lock function spin_lock style: this variant is very
+ * careful not to miss any non-lock wakeups.
+ *
+ * We store the current state under p->pi_lock in p->saved_state and
+ * the try_to_wake_up() code handles this accordingly.
+ *
+ * Outline: retry the acquisition under lock->wait_lock (lateral steals
+ * allowed); if that fails, save the task state, enqueue an on-stack
+ * waiter and loop between trying to take the lock and either
+ * adaptively spinning on the owner or scheduling. Once the lock is
+ * taken the saved task state is restored.
+ */
+static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
+{
+ struct task_struct *lock_owner, *self = current;
+ struct rt_mutex_waiter waiter, *top_waiter;
+ int ret;
+
+ rt_mutex_init_waiter(&waiter, true);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return;
+ }
+
+ BUG_ON(rt_mutex_owner(lock) == self);
+
+ /*
+ * We save whatever state the task is in and we'll restore it
+ * after acquiring the lock taking real wakeups into account
+ * as well. We are serialized via pi_lock against wakeups. See
+ * try_to_wake_up().
+ */
+ pi_lock(&self->pi_lock);
+ self->saved_state = self->state;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ pi_unlock(&self->pi_lock);
+
+ ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
+ BUG_ON(ret);
+
+ for (;;) {
+ /* Try to acquire the lock again. */
+ if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
+ break;
+
+ /* Sample top waiter and owner while wait_lock is still held */
+ top_waiter = rt_mutex_top_waiter(lock);
+ lock_owner = rt_mutex_owner(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ /*
+ * Only the top waiter spins on the owner; schedule if we are
+ * not the top waiter or if adaptive_wait() returned non-zero.
+ */
+ if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
+ schedule_rt_mutex(lock);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ pi_lock(&self->pi_lock);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ pi_unlock(&self->pi_lock);
+ }
+
+ /*
+ * Restore the task state to current->saved_state. We set it
+ * to the original state above and the try_to_wake_up() code
+ * has possibly updated it when a real (non-rtmutex) wakeup
+ * happened while we were blocked. Clear saved_state so
+ * try_to_wake_up() does not get confused.
+ */
+ pi_lock(&self->pi_lock);
+ __set_current_state(self->saved_state);
+ self->saved_state = TASK_RUNNING;
+ pi_unlock(&self->pi_lock);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up:
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
+ BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_free_waiter(&waiter);
+}
+
+/*
+ * Slow path to release a rt_mutex spin_lock style
+ *
+ * Both callers in this file take lock->wait_lock before calling; this
+ * function drops it on every path.
+ */
+static void __sched __rt_spin_lock_slowunlock(struct rt_mutex *lock)
+{
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ /* No waiters: just clear the owner and leave */
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ raw_spin_unlock(&lock->wait_lock);
+ return;
+ }
+
+ wakeup_next_waiter(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* Undo pi boosting when necessary */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * Unlock slow path: take lock->wait_lock and hand over to
+ * __rt_spin_lock_slowunlock(), which releases it again.
+ */
+static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+ __rt_spin_lock_slowunlock(lock);
+}
+
+/*
+ * Variant of rt_spin_lock_slowunlock() that obtains lock->wait_lock by
+ * looping on raw_spin_trylock() instead of calling raw_spin_lock().
+ * NOTE(review): presumably intended for the hard interrupt path used
+ * by rt_spin_unlock_after_trylock_in_irq() - confirm with the callers.
+ */
+static void noinline __sched rt_spin_lock_slowunlock_hirq(struct rt_mutex *lock)
+{
+ int ret;
+
+ do {
+ ret = raw_spin_trylock(&lock->wait_lock);
+ } while (!ret);
+
+ __rt_spin_lock_slowunlock(lock);
+}
+
+/*
+ * Acquire the rt_mutex behind @lock (fast path first, slow path on
+ * contention), then record the acquisition with lockdep.
+ */
+void __lockfunc rt_spin_lock(spinlock_t *lock)
+{
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+/*
+ * Like rt_spin_lock(), but operates on a bare rt_mutex and does no
+ * lockdep annotation.
+ */
+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(__rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Same as rt_spin_lock(), but the lockdep annotation carries
+ * @subclass.
+ */
+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+#endif
+
+/*
+ * Release @lock: drop the lockdep annotation, then unlock the
+ * underlying rt_mutex (fast path first, rt_spin_lock_slowunlock()
+ * otherwise).
+ */
+void __lockfunc rt_spin_unlock(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Same as rt_spin_unlock(), except that the slow path is
+ * rt_spin_lock_slowunlock_hirq(), which trylock-spins on wait_lock.
+ */
+void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_hirq);
+}
+
+/*
+ * Like rt_spin_unlock(), but operates on a bare rt_mutex and does no
+ * lockdep annotation.
+ */
+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(__rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
+{
+ /* Taking and immediately releasing the lock is the whole wait */
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock_wait);
+
+/*
+ * Trylock on a bare rt_mutex without lockdep annotation; returns the
+ * rt_mutex_trylock() result.
+ */
+int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_trylock(lock);
+}
+
+/*
+ * Try to take @lock. Returns the rt_mutex_trylock() result; lockdep
+ * is only told about the acquisition on success.
+ */
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = rt_mutex_trylock(&lock->lock);
+
+ if (ret)
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+/*
+ * Try to take @lock with bottom halves disabled. On success bottom
+ * halves stay disabled, migration is disabled and lockdep is
+ * annotated; on failure bottom halves are re-enabled. Returns the
+ * rt_mutex_trylock() result.
+ */
+int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ } else
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+/*
+ * irqsave flavour of the trylock. *@flags is simply set to 0. On
+ * success migration is disabled and lockdep is annotated. Returns the
+ * rt_mutex_trylock() result.
+ */
+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+ int ret;
+
+ *flags = 0;
+ ret = rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
+
+/*
+ * Decrement @atomic and return 1 with @lock held (and migration
+ * disabled) if the counter reached 0. Otherwise return 0 with @lock
+ * not held and migration enabled again.
+ */
+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
+ if (atomic_add_unless(atomic, -1, 1))
+ return 0;
+ /* The counter may reach 0: do the decrement under the lock */
+ migrate_disable();
+ rt_spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ rt_spin_unlock(lock);
+ migrate_enable();
+ return 0;
+}
+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
+
+/*
+ * Lockdep part of spinlock initialization. With
+ * CONFIG_DEBUG_LOCK_ALLOC the lockdep map is registered under @name
+ * and @key; without it this function does nothing.
+ */
+void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+
+#endif /* PREEMPT_RT_FULL */
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Check whether the acquire context @ctx has to back off from the
+ * ww_mutex that embeds @lock.
+ *
+ * Returns 0 if the ww_mutex has no holding context, -EALREADY if @ctx
+ * is the holding context, -EDEADLK if the stamp comparison below says
+ * @ctx must back off, and 0 otherwise.
+ */
+static inline int __sched
+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
+ struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+
+ if (!hold_ctx)
+ return 0;
+
+ if (unlikely(ctx == hold_ctx))
+ return -EALREADY;
+
+ /*
+ * Presumably a wraparound-safe "ctx is not older than hold_ctx"
+ * test, with the context address as tie-break for equal stamps
+ * - confirm against the ww_mutex design notes.
+ */
+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+ ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+#else
+/* !CONFIG_PREEMPT_RT_FULL stub: calling it is a bug. */
+static inline int __sched
+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ BUG();
+ return 0;
+}
+
+#endif
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
@@ -828,7 +1260,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
{
int ret = 0;
@@ -851,6 +1284,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
+ if (ww_ctx && ww_ctx->acquired > 0) {
+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ if (ret)
+ break;
+ }
+
raw_spin_unlock(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -884,25 +1323,101 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
}
}
+/*
+ * Account a newly acquired ww_mutex @ww in @ww_ctx by incrementing
+ * ww_ctx->acquired. With CONFIG_DEBUG_MUTEXES a set of sanity checks
+ * on @ww and @ww_ctx is run first.
+ */
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Record that @ww_ctx now holds the ww_mutex embedding @lock: account
+ * the acquisition in the context, publish the context in ww->ctx and
+ * wake every waiter queued on @lock.
+ */
+static void ww_mutex_account_lock(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
+ struct rt_mutex_waiter *waiter, *n;
+
+ /*
+ * This branch gets optimized out for the common case,
+ * and is only important for ww_mutex_lock.
+ */
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ ww->ctx = ww_ctx;
+
+ /*
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
+ */
+ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
+ tree_entry) {
+ /* XXX debug rt mutex waiter wakeup */
+
+ BUG_ON(waiter->lock != lock);
+ rt_mutex_wake_waiter(waiter);
+ }
+}
+
+#else
+
+/* !CONFIG_PREEMPT_RT_FULL stub: calling it is a bug. */
+static void ww_mutex_account_lock(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ BUG();
+}
+#endif
+
/*
* Slow path lock function:
*/
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+ int detect_deadlock, struct ww_acquire_ctx *ww_ctx)
{
struct rt_mutex_waiter waiter;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter, false);
raw_spin_lock(&lock->wait_lock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
+ if (ww_ctx)
+ ww_mutex_account_lock(lock, ww_ctx);
raw_spin_unlock(&lock->wait_lock);
return 0;
}
@@ -919,13 +1434,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
if (likely(!ret))
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ww_ctx);
set_current_state(TASK_RUNNING);
if (unlikely(ret)) {
remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+ } else if (ww_ctx) {
+ ww_mutex_account_lock(lock, ww_ctx);
}
/*
@@ -953,7 +1470,8 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
{
int ret = 0;
- raw_spin_lock(&lock->wait_lock);
+ if (!raw_spin_trylock(&lock->wait_lock))
+ return ret;
if (likely(rt_mutex_owner(lock) != current)) {
@@ -1041,30 +1559,33 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int detect_deadlock,
+ int detect_deadlock, struct ww_acquire_ctx *ww_ctx,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ int detect_deadlock,
+ struct ww_acquire_ctx *ww_ctx))
{
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, NULL, detect_deadlock);
+ return slowfn(lock, state, NULL, detect_deadlock, ww_ctx);
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout, int detect_deadlock,
+ struct ww_acquire_ctx *ww_ctx,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ int detect_deadlock,
+ struct ww_acquire_ctx *ww_ctx))
{
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, timeout, detect_deadlock);
+ return slowfn(lock, state, timeout, detect_deadlock, ww_ctx);
}
static inline int
@@ -1097,19 +1618,19 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
{
might_sleep();
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
/**
* rt_mutex_lock_interruptible - lock a rt_mutex interruptible
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @detect_deadlock: deadlock detection on/off
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * 0 on success
+ * -EINTR when interrupted by a signal
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
@@ -1118,22 +1639,43 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
might_sleep();
return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
- detect_deadlock, rt_mutex_slowlock);
+ detect_deadlock, NULL, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_KILLABLE,
+ detect_deadlock, NULL, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
* by the caller
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
* @detect_deadlock: deadlock detection on/off
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * 0 on success
+ * -EINTR when interrupted by a signal
* -ETIMEDOUT when the timeout expired
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
@@ -1144,7 +1686,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
might_sleep();
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- detect_deadlock, rt_mutex_slowlock);
+ detect_deadlock, NULL, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1202,13 +1744,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
void __rt_mutex_init(struct rt_mutex *lock, const char *name)
{
lock->owner = NULL;
- raw_spin_lock_init(&lock->wait_lock);
lock->waiters = RB_ROOT;
lock->waiters_leftmost = NULL;
debug_rt_mutex_init(lock, name);
}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+EXPORT_SYMBOL(__rt_mutex_init);
/**
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
@@ -1223,7 +1764,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
- __rt_mutex_init(lock, NULL);
+ rt_mutex_init(lock);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
rt_mutex_deadlock_account_lock(lock, proxy_owner);
@@ -1272,6 +1813,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /*
+ * In PREEMPT_RT there's an added race.
+ * If the task, that we are about to requeue, times out,
+ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
+ * to skip this task. But right after the task sets
+ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
+ * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
+ * This will replace the PI_WAKEUP_INPROGRESS with the actual
+ * lock that it blocks on. We *must not* place this task
+ * on this proxy lock in that case.
+ *
+ * To prevent this race, we first take the task's pi_lock
+ * and check if it has updated its pi_blocked_on. If it has,
+ * we assume that it woke up and we return -EAGAIN.
+ * Otherwise, we set the task's pi_blocked_on to
+ * PI_REQUEUE_INPROGRESS, so that if the task is waking up
+ * it will know that we are in the process of requeuing it.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ if (task->pi_blocked_on) {
+ raw_spin_unlock_irq(&task->pi_lock);
+ raw_spin_unlock(&lock->wait_lock);
+ return -EAGAIN;
+ }
+ task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
+ raw_spin_unlock_irq(&task->pi_lock);
+#endif
+
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
@@ -1342,7 +1912,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
set_current_state(TASK_RUNNING);
@@ -1359,3 +1929,88 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
return ret;
}
+
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) { /* countdown expired: report a fake deadlock */
+ tmp = ctx->deadlock_inject_interval;
+ if (tmp > UINT_MAX/4) /* clamp so the growth below cannot overflow */
+ tmp = UINT_MAX;
+ else
+ tmp = tmp*2 + tmp + tmp/2; /* next interval is 3.5x the current one */
+
+ ctx->deadlock_inject_interval = tmp;
+ ctx->deadlock_inject_countdown = tmp;
+ ctx->contending_lock = lock;
+
+ ww_mutex_unlock(lock); /* release the lock before returning -EDEADLK */
+
+ return -EDEADLK;
+ }
+#endif
+
+ return 0; /* no injection: without CONFIG_DEBUG_WW_MUTEX_SLOWPATH this is the only path */
+}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ int ret; /* 0 on success, otherwise the error from rt_mutex_slowlock() or -EDEADLK from injection */
+
+ might_sleep();
+
+ mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_); /* nest under the acquire ctx, same as __ww_mutex_lock() */
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_); /* lock not taken: undo the lockdep acquire */
+ else if (ww_ctx->acquired > 1) /* ret == 0 here; only inject once more than one lock is held */
+ return ww_mutex_deadlock_injection(lock, ww_ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ int ret; /* 0 on success, otherwise the error from rt_mutex_slowlock() or -EDEADLK from injection */
+
+ might_sleep();
+
+ mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map,
+ _RET_IP_);
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_); /* lock not taken: undo the lockdep acquire */
+ else if (!ret && ww_ctx->acquired > 1) /* !ret is always true in this branch */
+ return ww_mutex_deadlock_injection(lock, ww_ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ /*
+ * Drop the acquire-context accounting (if the lock was taken
+ * with a context) before releasing the underlying rt_mutex.
+ */
+ if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0) /* never let the counter underflow */
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
+ rt_mutex_unlock(&lock->base.lock);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+#endif
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..ac636d37ec32 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -49,6 +49,7 @@ struct rt_mutex_waiter {
struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
+ bool savestate;
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
struct pid *deadlock_task_pid;
@@ -104,6 +105,9 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
/*
* PI-futex support (proxy locking functions, etc.):
*/
+#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
+#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
+
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
@@ -124,4 +128,14 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
# include "rtmutex.h"
#endif
+static inline void
+rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ waiter->task = NULL;
+ waiter->savestate = savestate;
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+}
+
#endif
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..5c76166f88e2 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT_FULL
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT_FULL
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !PREEMPT_RT_FULL */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..94970338d518 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT_FULL
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT_FULL
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 6d6300375090..d7eb2453a18f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -368,9 +368,11 @@ static u64 oops_id;
static int init_oops_id(void)
{
+#ifndef CONFIG_PREEMPT_RT_FULL
if (!oops_id)
get_random_bytes(&oops_id, sizeof(oops_id));
else
+#endif
oops_id++;
return 0;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 3b8946416a5f..1764090d598d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -3,6 +3,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
@@ -640,7 +641,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE_NONRT(!irqs_disabled());
ret = 0;
old_incr = timer->it.cpu.incr;
@@ -1061,7 +1062,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
/*
* Now re-arm for the new expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE_NONRT(!irqs_disabled());
arm_timer(timer);
unlock_task_sighand(p, &flags);
@@ -1127,10 +1128,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
sig = tsk->signal;
if (sig->cputimer.running) {
struct task_cputime group_sample;
+ unsigned long flags;
- raw_spin_lock(&sig->cputimer.lock);
+ raw_spin_lock_irqsave(&sig->cputimer.lock, flags);
group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1144,13 +1146,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
* already updated our counts. We need to check if any timers fire now.
* Interrupts are disabled.
*/
-void run_posix_cpu_timers(struct task_struct *tsk)
+static void __run_posix_cpu_timers(struct task_struct *tsk)
{
LIST_HEAD(firing);
struct k_itimer *timer, *next;
unsigned long flags;
- WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE_NONRT(!irqs_disabled());
/*
* The fast path checks that there are no expired thread or thread
@@ -1208,6 +1210,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
}
}
+#ifdef CONFIG_PREEMPT_RT_BASE
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
+
+static int posix_cpu_timers_thread(void *data)
+{
+ int cpu = (long)data; /* CPU number is passed in the thread's data pointer */
+
+ BUG_ON(per_cpu(posix_timer_task,cpu) != current);
+
+ while (!kthread_should_stop()) {
+ struct task_struct *tsk = NULL;
+ struct task_struct *next = NULL;
+
+ if (cpu_is_offline(cpu))
+ goto wait_to_die;
+
+ /* grab the whole task list and reset the list head, with irqs off */
+ raw_local_irq_disable();
+ tsk = per_cpu(posix_timer_tasklist, cpu);
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+ raw_local_irq_enable();
+
+ /* it's possible the list is empty; sleep until woken, then retry */
+ if (!tsk) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ continue;
+ }
+
+ /* Process task list */
+ while (1) {
+ /* save next */
+ next = tsk->posix_timer_list;
+
+ /* run the task timers, clear its ptr and
+ * unreference it
+ */
+ __run_posix_cpu_timers(tsk);
+ tsk->posix_timer_list = NULL;
+ put_task_struct(tsk);
+
+ /* check if this is the last on the list (self-pointing entry) */
+ if (next == tsk)
+ break;
+ tsk = next;
+ }
+ }
+ return 0;
+
+wait_to_die:
+ /* CPU went offline: idle here until kthread_stop() is called */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static inline int __fastpath_timer_check(struct task_struct *tsk)
+{
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
+ return 0;
+
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return 1;
+
+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
+ return 1;
+
+ return 0;
+}
+
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ unsigned long cpu = smp_processor_id();
+ struct task_struct *tasklist;
+
+ BUG_ON(!irqs_disabled());
+ if(!per_cpu(posix_timer_task, cpu)) /* no timer thread exists for this CPU */
+ return;
+ /* get per-cpu references */
+ tasklist = per_cpu(posix_timer_tasklist, cpu);
+
+ /* queue tsk only if it is not already queued and has timers to check */
+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
+ get_task_struct(tsk); /* reference is dropped by posix_cpu_timers_thread() */
+ if (tasklist) {
+ tsk->posix_timer_list = tasklist;
+ } else {
+ /*
+ * The list is terminated by a self-pointing
+ * task_struct
+ */
+ tsk->posix_timer_list = tsk;
+ }
+ per_cpu(posix_timer_tasklist, cpu) = tsk; /* tsk becomes the new list head */
+
+ wake_up_process(per_cpu(posix_timer_task, cpu));
+ }
+}
+
+/*
+ * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary posix cpu timer thread for the new CPU.
+ */
+static int posix_cpu_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ struct task_struct *p;
+ struct sched_param param;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ p = kthread_create(posix_cpu_timers_thread, hcpu,
+ "posixcputmr/%d",cpu);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ p->flags |= PF_NOFREEZE;
+ kthread_bind(p, cpu);
+ /* Must be high prio to avoid getting starved */
+ param.sched_priority = MAX_RT_PRIO-1;
+ sched_setscheduler(p, SCHED_FIFO, &param);
+ per_cpu(posix_timer_task,cpu) = p;
+ break;
+ case CPU_ONLINE:
+ /* Strictly unnecessary, as first user will wake it. */
+ wake_up_process(per_cpu(posix_timer_task,cpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ /* Unbind it from offline cpu so it can run, then stop it. */
+ kthread_bind(per_cpu(posix_timer_task, cpu),
+ cpumask_any(cpu_online_mask));
+ kthread_stop(per_cpu(posix_timer_task,cpu));
+ per_cpu(posix_timer_task,cpu) = NULL;
+ break;
+ case CPU_DEAD:
+ kthread_stop(per_cpu(posix_timer_task,cpu));
+ per_cpu(posix_timer_task,cpu) = NULL;
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block posix_cpu_thread_notifier = {
+ .notifier_call = posix_cpu_thread_call,
+ .priority = 10
+};
+
+static int __init posix_cpu_thread_init(void)
+{
+ void *hcpu = (void *)(long)smp_processor_id();
+ /* Start one for boot CPU. */
+ unsigned long cpu;
+
+ /* init the per-cpu posix_timer_tasklists */
+ for_each_possible_cpu(cpu)
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+
+ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu); /* create the boot CPU's thread */
+ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu); /* and wake it */
+ register_cpu_notifier(&posix_cpu_thread_notifier);
+ return 0;
+}
+early_initcall(posix_cpu_thread_init);
+#else /* CONFIG_PREEMPT_RT_BASE */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ __run_posix_cpu_timers(tsk);
+}
+#endif /* CONFIG_PREEMPT_RT_BASE */
+
/*
* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
* The tsk->sighand->siglock must be held by the caller.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 77e6b83c0431..5218a7d45505 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -497,6 +497,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
static struct pid *good_sigevent(sigevent_t * event)
{
struct task_struct *rtn = current->group_leader;
+ int sig = event->sigev_signo;
if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
(!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
@@ -505,7 +506,8 @@ static struct pid *good_sigevent(sigevent_t * event)
return NULL;
if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+ (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
+ sig_kernel_coredump(sig)))
return NULL;
return task_pid(rtn);
@@ -817,6 +819,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
return overrun;
}
+/*
+ * Protected by RCU!
+ */
+static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
+{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (kc->timer_set == common_timer_set)
+ hrtimer_wait_for_timer(&timr->it.real.timer);
+ else
+ /* FIXME: Whacky hack for posix-cpu-timers */
+ schedule_timeout(1);
+#endif
+}
+
/* Set a POSIX.1b interval timer. */
/* timr->it_lock is taken. */
static int
@@ -894,6 +910,7 @@ retry:
if (!timr)
return -EINVAL;
+ rcu_read_lock();
kc = clockid_to_kclock(timr->it_clock);
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
@@ -902,9 +919,12 @@ retry:
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
+ timer_wait_for_callback(kc, timr);
rtn = NULL; // We already got the old time...
+ rcu_read_unlock();
goto retry;
}
+ rcu_read_unlock();
if (old_setting && !error &&
copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
@@ -942,10 +962,15 @@ retry_delete:
if (!timer)
return -EINVAL;
+ rcu_read_lock();
if (timer_delete_hook(timer) == TIMER_RETRY) {
unlock_timer(timer, flags);
+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
+ timer);
+ rcu_read_unlock();
goto retry_delete;
}
+ rcu_read_unlock();
spin_lock(&current->sighand->siglock);
list_del(&timer->list);
@@ -971,8 +996,18 @@ static void itimer_delete(struct k_itimer *timer)
retry_delete:
spin_lock_irqsave(&timer->it_lock, flags);
+ /* On RT we can race with a deletion */
+ if (!timer->it_signal) {
+ unlock_timer(timer, flags);
+ return;
+ }
+
if (timer_delete_hook(timer) == TIMER_RETRY) {
+ rcu_read_lock();
unlock_timer(timer, flags);
+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
+ timer);
+ rcu_read_unlock();
goto retry_delete;
}
list_del(&timer->list);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 126586a31408..190b4548a281 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -276,6 +276,8 @@ static int create_image(int platform_mode)
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (error) {
printk(KERN_ERR "PM: Some system devices failed to power down, "
@@ -303,6 +305,7 @@ static int create_image(int platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -428,6 +431,7 @@ static int resume_target_kernel(bool platform_mode)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
error = syscore_suspend();
if (error)
@@ -461,6 +465,7 @@ static int resume_target_kernel(bool platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -549,6 +554,7 @@ int hibernation_platform_enter(void)
goto Platform_finish;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
syscore_suspend();
if (pm_wakeup_pending()) {
error = -EAGAIN;
@@ -561,6 +567,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
enable_nonboot_cpus();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5455d5c3c149..c2372edf5857 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (!error) {
*wakeup = pm_wakeup_pending();
@@ -220,6 +222,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
syscore_resume();
}
+ system_state = SYSTEM_RUNNING;
+
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8c086e6049b9..376df3d58dd2 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1025,6 +1025,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
{
char *text;
int len = 0;
+ int attempts = 0;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
@@ -1036,7 +1037,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 seq;
u32 idx;
enum log_flags prev;
-
+ int num_msg;
+try_again:
+ attempts++;
+ if (attempts > 10) {
+ len = -EBUSY;
+ goto out;
+ }
+ num_msg = 0;
if (clear_seq < log_first_seq) {
/* messages are gone, move to first available one */
clear_seq = log_first_seq;
@@ -1057,6 +1065,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
prev = msg->flags;
idx = log_next(idx);
seq++;
+ num_msg++;
+ if (num_msg > 5) {
+ num_msg = 0;
+ raw_spin_unlock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
+ if (clear_seq < log_first_seq)
+ goto try_again;
+ }
}
/* move first record forward until length fits into the buffer */
@@ -1070,6 +1086,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
prev = msg->flags;
idx = log_next(idx);
seq++;
+ num_msg++;
+ if (num_msg > 5) {
+ num_msg = 0;
+ raw_spin_unlock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
+ if (clear_seq < log_first_seq)
+ goto try_again;
+ }
}
/* last message fitting into this dump */
@@ -1110,6 +1134,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
clear_seq = log_next_seq;
clear_idx = log_next_idx;
}
+out:
raw_spin_unlock_irq(&logbuf_lock);
kfree(text);
@@ -1267,6 +1292,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
if (!console_drivers)
return;
+ migrate_disable();
for_each_console(con) {
if (exclusive_console && con != exclusive_console)
continue;
@@ -1279,6 +1305,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
continue;
con->write(con, text, len);
}
+ migrate_enable();
}
/*
@@ -1338,12 +1365,18 @@ static inline int can_use_console(unsigned int cpu)
* interrupts disabled. It should return with 'lockbuf_lock'
* released but interrupts still disabled.
*/
-static int console_trylock_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(unsigned int cpu, unsigned long flags)
__releases(&logbuf_lock)
{
int retval = 0, wake = 0;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ int lock = !early_boot_irqs_disabled && !irqs_disabled_flags(flags) &&
+ (preempt_count() <= 1);
+#else
+ int lock = 1;
+#endif
- if (console_trylock()) {
+ if (lock && console_trylock()) {
retval = 1;
/*
@@ -1483,6 +1516,62 @@ static size_t cont_print_text(char *text, size_t size)
return textlen;
}
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+void early_vprintk(const char *fmt, va_list ap)
+{
+ if (early_console) {
+ char buf[512];
+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
+
+ early_console->write(early_console, buf, n);
+ }
+}
+
+asmlinkage void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ early_vprintk(fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * This is independent of any log levels - a global
+ * kill switch that turns off all of printk.
+ *
+ * Used by the NMI watchdog if early-printk is enabled.
+ */
+static bool __read_mostly printk_killswitch;
+
+static int __init force_early_printk_setup(char *str)
+{
+ printk_killswitch = true;
+ return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
+void printk_kill(void)
+{
+ printk_killswitch = true;
+}
+
+static int forced_early_printk(const char *fmt, va_list ap)
+{
+ if (!printk_killswitch)
+ return 0;
+ early_vprintk(fmt, ap);
+ return 1;
+}
+#else
+static inline int forced_early_printk(const char *fmt, va_list ap)
+{
+ return 0;
+}
+#endif
+
asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
@@ -1496,6 +1585,13 @@ asmlinkage int vprintk_emit(int facility, int level,
int this_cpu;
int printed_len = 0;
+ /*
+ * Fall back to early_printk if a debugging subsystem has
+ * killed printk output
+ */
+ if (unlikely(forced_early_printk(fmt, args)))
+ return 1;
+
boot_delay_msec(level);
printk_delay();
@@ -1618,8 +1714,15 @@ asmlinkage int vprintk_emit(int facility, int level,
* The console_trylock_for_printk() function will release 'logbuf_lock'
* regardless of whether it actually gets the console semaphore or not.
*/
- if (console_trylock_for_printk(this_cpu))
+ if (console_trylock_for_printk(this_cpu, flags)) {
+#ifndef CONFIG_PREEMPT_RT_FULL
console_unlock();
+#else
+ raw_local_irq_restore(flags);
+ console_unlock();
+ raw_local_irq_save(flags);
+#endif
+ }
lockdep_on();
out_restore_irqs:
@@ -1721,29 +1824,6 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
#endif /* CONFIG_PRINTK */
-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-void early_vprintk(const char *fmt, va_list ap)
-{
- if (early_console) {
- char buf[512];
- int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-
- early_console->write(early_console, buf, n);
- }
-}
-
-asmlinkage void early_printk(const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- early_vprintk(fmt, ap);
- va_end(ap);
-}
-#endif
-
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
{
@@ -1984,11 +2064,16 @@ static void console_cont_flush(char *text, size_t size)
goto out;
len = cont_print_text(text, size);
+#ifndef CONFIG_PREEMPT_RT_FULL
raw_spin_unlock(&logbuf_lock);
stop_critical_timings();
call_console_drivers(cont.level, text, len);
start_critical_timings();
local_irq_restore(flags);
+#else
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ call_console_drivers(cont.level, text, len);
+#endif
return;
out:
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
@@ -2071,12 +2156,17 @@ skip:
console_idx = log_next(console_idx);
console_seq++;
console_prev = msg->flags;
- raw_spin_unlock(&logbuf_lock);
+#ifndef CONFIG_PREEMPT_RT_FULL
+ raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(level, text, len);
start_critical_timings();
local_irq_restore(flags);
+#else
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ call_console_drivers(level, text, len);
+#endif
}
console_locked = 0;
mutex_release(&console_lock_dep_map, 1, _RET_IP_);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3cc21c..fddaf65cde4d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -135,7 +135,12 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !__fatal_signal_pending(task)) {
- task->state = __TASK_TRACED;
+ raw_spin_lock_irq(&task->pi_lock);
+ if (task->state & __TASK_TRACED)
+ task->state = __TASK_TRACED;
+ else
+ task->saved_state = __TASK_TRACED;
+ raw_spin_unlock_irq(&task->pi_lock);
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d024..14d847434d19 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -372,6 +372,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* Post an RCU bottom-half callback to be invoked after any subsequent
* quiescent state.
@@ -381,6 +382,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
__call_rcu(head, func, &rcu_bh_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+#endif
void rcu_init(void)
{
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6705d947ef14..dc55b92ea5a8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,11 @@
#include <linux/random.h>
#include <linux/ftrace_event.h>
#include <linux/suspend.h>
+#include <linux/delay.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include "../time/tick-internal.h"
#include "tree.h"
#include <trace/events/rcu.h>
@@ -145,8 +150,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-#ifdef CONFIG_RCU_BOOST
-
/*
* Control variables for per-CPU and per-rcu_node kthreads. These
* handle all flavors of RCU.
@@ -156,8 +159,6 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -199,6 +200,19 @@ void rcu_sched_qs(int cpu)
rdp->passed_quiesce = 1;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void rcu_preempt_qs(int cpu);
+
+void rcu_bh_qs(int cpu)
+{
+ unsigned long flags;
+
+ /* rcu_preempt_qs() must be called with irqs disabled. */
+ local_irq_save(flags);
+ rcu_preempt_qs(cpu);
+ local_irq_restore(flags);
+}
+#else
void rcu_bh_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
@@ -207,6 +221,7 @@ void rcu_bh_qs(int cpu)
trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
rdp->passed_quiesce = 1;
}
+#endif
/*
* Note a context switch. This is a quiescent state for RCU-sched,
@@ -263,6 +278,7 @@ long rcu_batches_completed_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* Return the number of RCU BH batches processed thus far for debug & stats.
*/
@@ -280,6 +296,7 @@ void rcu_bh_force_quiescent_state(void)
force_quiescent_state(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
+#endif
/*
* Record the number of times rcutorture tests have been initiated and
@@ -1240,7 +1257,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!ACCESS_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
- wake_up(&rsp->gp_wq);
+ swait_wake(&rsp->gp_wq);
}
/*
@@ -1609,7 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("reqwait"));
- wait_event_interruptible(rsp->gp_wq,
+ swait_event_interruptible(rsp->gp_wq,
ACCESS_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
/* Locking provides needed memory barrier. */
@@ -1636,7 +1653,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqswait"));
- ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_interruptible_timeout(rsp->gp_wq,
((gf = ACCESS_ONCE(rsp->gp_flags)) &
RCU_GP_FLAG_FQS) ||
(!ACCESS_ONCE(rnp->qsmask) &&
@@ -2378,16 +2395,14 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/*
* Do RCU core processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void rcu_process_callbacks(void)
{
struct rcu_state *rsp;
if (cpu_is_offline(smp_processor_id()))
return;
- trace_rcu_utilization(TPS("Start RCU core"));
for_each_rcu_flavor(rsp)
__rcu_process_callbacks(rsp);
- trace_rcu_utilization(TPS("End RCU core"));
}
/*
@@ -2401,18 +2416,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
return;
- if (likely(!rsp->boost)) {
- rcu_do_batch(rsp, rdp);
+ rcu_do_batch(rsp, rdp);
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+ wake_up_process(t);
+}
+
+/*
+ * Wake up this CPU's rcuc kthread to do RCU core processing.
+ */
+static void invoke_rcu_core(void)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!cpu_online(smp_processor_id()))
return;
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ t = __this_cpu_read(rcu_cpu_kthread_task);
+ if (t != NULL && current != t)
+ rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
+ local_irq_restore(flags);
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that performs RCU core processing, including
+ * callback invocation. It replaces the RCU softirq in every RCU flavor
+ * and configuration, whether or not RCU priority boosting is enabled.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+ char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_process_callbacks();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
}
- invoke_rcu_callbacks_kthread();
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
}
-static void invoke_rcu_core(void)
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
{
- if (cpu_online(smp_processor_id()))
- raise_softirq(RCU_SOFTIRQ);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ return 0;
}
+early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2543,6 +2645,7 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* Queue an RCU callback for invocation after a quicker grace period.
*/
@@ -2551,6 +2654,7 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
__call_rcu(head, func, &rcu_bh_state, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+#endif
/*
* Because a context switch is a grace period for RCU-sched and RCU-bh,
@@ -2628,6 +2732,7 @@ void synchronize_sched(void)
}
EXPORT_SYMBOL_GPL(synchronize_sched);
+#ifndef CONFIG_PREEMPT_RT_FULL
/**
* synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
*
@@ -2654,6 +2759,7 @@ void synchronize_rcu_bh(void)
wait_rcu_gp(call_rcu_bh);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+#endif
static int synchronize_sched_expedited_cpu_stop(void *data)
{
@@ -3074,6 +3180,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
mutex_unlock(&rsp->barrier_mutex);
}
+#ifndef CONFIG_PREEMPT_RT_FULL
/**
* rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
*/
@@ -3082,6 +3189,7 @@ void rcu_barrier_bh(void)
_rcu_barrier(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+#endif
/**
* rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
@@ -3385,7 +3493,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rsp->rda = rda;
- init_waitqueue_head(&rsp->gp_wq);
+ init_swait_head(&rsp->gp_wq);
init_irq_work(&rsp->wakeup_work, rsp_wakeup);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
@@ -3483,7 +3591,6 @@ void __init rcu_init(void)
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
__rcu_init_preempt();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac9..f8a66923f6e2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -28,6 +28,7 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#include <linux/irq_work.h>
+#include <linux/wait-simple.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -200,7 +201,7 @@ struct rcu_node {
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
- wait_queue_head_t nocb_gp_wq[2];
+ struct swait_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
@@ -334,7 +335,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* (approximate). */
int nocb_p_count; /* # CBs being invoked by kthread */
int nocb_p_count_lazy; /* (approximate). */
- wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
@@ -405,7 +406,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
- wait_queue_head_t gp_wq; /* Where GP task waits. */
+ struct swait_head gp_wq; /* Where GP task waits. */
int gp_flags; /* Commands for GP task. */
/* End of fields guarded by root rcu_node's lock. */
@@ -531,10 +532,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b920..e8e67dac6f3e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -24,12 +24,6 @@
* Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/smpboot.h>
-#include "../time/tick-internal.h"
-
#define RCU_KTHREAD_PRIO 1
#ifdef CONFIG_RCU_BOOST
@@ -370,7 +364,7 @@ void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block, complain if they get here. */
- if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+ if (WARN_ON_ONCE(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET))) {
local_irq_restore(flags);
return;
}
@@ -670,15 +664,6 @@ static void rcu_preempt_check_callbacks(int cpu)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/*
* Queue a preemptible-RCU callback for invocation after a grace period.
*/
@@ -1146,6 +1131,19 @@ void exit_rcu(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
+#ifdef CONFIG_RCU_BOOST
+ struct sched_param sp;
+
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
@@ -1177,16 +1175,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-static void rcu_wake_cond(struct task_struct *t, int status)
-{
- /*
- * If the thread is yielding, only wake it when this
- * is invoked from idle
- */
- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
- wake_up_process(t);
-}
-
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1331,23 +1319,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
- __this_cpu_read(rcu_cpu_kthread_status));
- }
- local_irq_restore(flags);
-}
-
-/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
@@ -1402,67 +1373,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
- rcu_preempt_do_callbacks();
-}
-
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
- struct sched_param sp;
-
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
- return __this_cpu_read(rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
- * RCU softirq used in flavors and configurations of RCU that do not
- * support RCU priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
- int spincnt;
-
- for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
- local_bh_disable();
- *statusp = RCU_KTHREAD_RUNNING;
- this_cpu_inc(rcu_cpu_kthread_loops);
- local_irq_disable();
- work = *workp;
- *workp = 0;
- local_irq_enable();
- if (work)
- rcu_kthread_do_work();
- local_bh_enable();
- if (*workp == 0) {
- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
- *statusp = RCU_KTHREAD_WAITING;
- return;
- }
- }
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
- *statusp = RCU_KTHREAD_WAITING;
-}
-
/*
* Set the per-rcu_node kthread's affinity to cover all CPUs that are
* served by the rcu_node in question. The CPU hotplug lock is still
@@ -1496,27 +1406,14 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
free_cpumask_var(cm);
}
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_cpu_kthread_task,
- .thread_should_run = rcu_cpu_kthread_should_run,
- .thread_fn = rcu_cpu_kthread,
- .thread_comm = "rcuc/%u",
- .setup = rcu_cpu_kthread_setup,
- .park = rcu_cpu_kthread_park,
-};
-
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
static int __init rcu_spawn_kthreads(void)
{
struct rcu_node *rnp;
- int cpu;
rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu)
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
rnp = rcu_get_root(rcu_state);
(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
@@ -1544,11 +1441,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
static bool rcu_is_callbacks_kthread(void)
{
return false;
@@ -1575,7 +1467,7 @@ static void rcu_prepare_kthreads(int cpu)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
+#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
/*
* Check to see if any future RCU-related work will need to be done
@@ -1591,6 +1483,9 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
*delta_jiffies = ULONG_MAX;
return rcu_cpu_has_callbacks(cpu, NULL);
}
+#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
+
+#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1688,6 +1583,8 @@ static bool rcu_try_advance_all_cbs(void)
return cbs_ready;
}
+#ifndef CONFIG_PREEMPT_RT_FULL
+
/*
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
@@ -1726,6 +1623,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
}
return 0;
}
+#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -2079,7 +1977,7 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp)
*/
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
- wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+ swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
}
/*
@@ -2097,8 +1995,8 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- init_waitqueue_head(&rnp->nocb_gp_wq[0]);
- init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+ init_swait_head(&rnp->nocb_gp_wq[0]);
+ init_swait_head(&rnp->nocb_gp_wq[1]);
}
/* Is the specified CPU a no-CPUs CPU? */
@@ -2143,7 +2041,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
- wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ swait_wake(&rdp->nocb_wq); /* ... if queue was empty ... */
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
@@ -2247,7 +2145,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
- wait_event_interruptible(
+ swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
if (likely(d))
@@ -2278,7 +2176,7 @@ static int rcu_nocb_kthread(void *arg)
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("Sleep"));
- wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
+ swait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
/* Memory barrier provide by xchg() below. */
} else if (firsttime) {
firsttime = 0;
@@ -2352,7 +2250,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_up(&rdp->nocb_wq);
+ swait_wake(&rdp->nocb_wq);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
}
@@ -2360,7 +2258,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
- init_waitqueue_head(&rdp->nocb_wq);
+ init_swait_head(&rdp->nocb_wq);
}
/* Create a kthread for each RCU flavor for each no-CBs CPU. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf233..9fb149fa7db9 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -140,6 +140,7 @@ int notrace debug_lockdep_rcu_enabled(void)
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+#ifndef CONFIG_PREEMPT_RT_FULL
/**
* rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
@@ -166,6 +167,7 @@ int rcu_read_lock_bh_held(void)
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+#endif
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/relay.c b/kernel/relay.c
index 5001c9887db1..b91551397970 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data)
{
struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
+ /*
+ * Stupid polling for now:
+ */
+ mod_timer(&buf->timer, jiffies + 1);
}
/**
@@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ mod_timer(&buf->timer, jiffies + 1);
} else
del_timer_sync(&buf->timer);
@@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
else
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
- smp_mb();
- if (waitqueue_active(&buf->read_wait))
- /*
- * Calling wake_up_interruptible() from here
- * will deadlock if we happen to be logging
- * from the scheduler (trying to re-grab
- * rq->lock), so defer it.
- */
- mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 4aa8a305aede..3fbcb0d712bb 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -49,7 +49,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
r = ret = 0;
*limit_fail_at = NULL;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
r = res_counter_charge_locked(c, val, force);
@@ -69,7 +69,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
spin_unlock(&u->lock);
}
}
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
return ret;
}
@@ -103,7 +103,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter,
struct res_counter *c;
u64 ret = 0;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
for (c = counter; c != top; c = c->parent) {
u64 r;
spin_lock(&c->lock);
@@ -112,7 +112,7 @@ u64 res_counter_uncharge_until(struct res_counter *counter,
ret = r;
spin_unlock(&c->lock);
}
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
return ret;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..b14a512da520 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o proc.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o wait-simple.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..e529e5fcedfb 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -30,10 +30,10 @@ void complete(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ __swait_wake_locked(&x->wait, TASK_NORMAL, 1);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -50,10 +50,10 @@ void complete_all(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ __swait_wake_locked(&x->wait, TASK_NORMAL, 0);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DEFINE_SWAITER(wait);
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ swait_prepare_locked(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ swait_finish_locked(&x->wait, &wait);
if (!x->done)
return timeout;
}
@@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
{
might_sleep();
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
return timeout;
}
@@ -267,12 +267,12 @@ bool try_wait_for_completion(struct completion *x)
unsigned long flags;
int ret = 1;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
else
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -290,10 +290,10 @@ bool completion_done(struct completion *x)
unsigned long flags;
int ret = 1;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a3f3c4e1f5a..8181ed6b738a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -272,7 +272,11 @@ late_initcall(sched_init_debug);
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
+#ifndef CONFIG_PREEMPT_RT_FULL
const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#endif
/*
* period over which we average the RT time consumption, measured
@@ -489,6 +493,7 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
+ rq->hrtick_timer.irqsafe = 1;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -534,6 +539,37 @@ void resched_task(struct task_struct *p)
smp_send_reschedule(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+void resched_task_lazy(struct task_struct *p)
+{
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_task(p);
+ return;
+ }
+
+ lockdep_assert_held(&task_rq(p)->lock);
+
+ if (test_tsk_need_resched(p))
+ return;
+
+ if (test_tsk_need_resched_lazy(p))
+ return;
+
+ set_tsk_need_resched_lazy(p);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(p))
+ smp_send_reschedule(cpu);
+}
+#endif
+
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1122,6 +1158,18 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
+static bool check_task_state(struct task_struct *p, long match_state)
+{
+ bool match = false;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ if (p->state == match_state || p->saved_state == match_state)
+ match = true;
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+}
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1166,7 +1214,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ if (match_state && !check_task_state(p, match_state))
return 0;
cpu_relax();
}
@@ -1181,7 +1229,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
on_rq = p->on_rq;
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if (!match_state || p->state == match_state
+ || p->saved_state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &flags);
@@ -1406,10 +1455,6 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = 1;
-
- /* if a worker is waking up, notify workqueue */
- if (p->flags & PF_WQ_WORKER)
- wq_worker_waking_up(p, cpu_of(rq));
}
/*
@@ -1592,8 +1637,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- if (!(p->state & state))
+ if (!(p->state & state)) {
+ /*
+ * The task might be running due to a spinlock sleeper
+ * wakeup. Check the saved state and set it to running
+ * if the wakeup condition is true.
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ success = 1;
+ }
+ }
goto out;
+ }
+
+ /*
+ * If this is a regular wakeup, then we can unconditionally
+ * clear the saved state of a "lock sleeper".
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER))
+ p->saved_state = TASK_RUNNING;
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1636,42 +1700,6 @@ out:
}
/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
- struct rq *rq = task_rq(p);
-
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
- return;
-
- lockdep_assert_held(&rq->lock);
-
- if (!raw_spin_trylock(&p->pi_lock)) {
- raw_spin_unlock(&rq->lock);
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- }
-
- if (!(p->state & TASK_NORMAL))
- goto out;
-
- if (!p->on_rq)
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-
- ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
-out:
- raw_spin_unlock(&p->pi_lock);
-}
-
-/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -1685,11 +1713,23 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- WARN_ON(task_is_stopped_or_traced(p));
+ WARN_ON(__task_is_stopped_or_traced(p));
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
+ * @p: The process to be woken up.
+ *
+ * Like wake_up_process() above, but uses the TASK_ALL state mask and passes
+ * wake_flags=WF_LOCK_SLEEPER to indicate the nature of the wakeup.
+ */
+int wake_up_lock_sleeper(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
+}
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
@@ -1867,6 +1907,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -2150,11 +2193,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
+ /*
+ * We use mmdrop_delayed() here so we don't have to do the
+ * full __mmdrop() when we are the last user.
+ */
if (mm)
- mmdrop(mm);
+ mmdrop_delayed(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2514,8 +2559,13 @@ void __kprobes preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
EXPORT_SYMBOL(preempt_count_add);
@@ -2558,6 +2608,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -2581,6 +2638,133 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
+#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
+#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */
+#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN)
+#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN)
+
+/*
+ * update_migrate_disable - make a pending migrate_disable() visible to the
+ * scheduling class of @p.
+ *
+ * Invoked on the task being switched out, right after the scheduler core
+ * has taken rq->lock. It is a no-op unless @p has migration disabled and
+ * its affinity has not been narrowed yet. Otherwise it passes the mask
+ * returned by tsk_cpus_allowed() to the class, forces nr_cpus_allowed to 1
+ * and sets MIGRATE_DISABLE_SET_AFFIN so that migrate_enable() knows it has
+ * to restore the original affinity.
+ */
+static inline void update_migrate_disable(struct task_struct *p)
+{
+ const struct cpumask *mask;
+
+ if (likely(!p->migrate_disable))
+ return;
+
+ /* Did we already update affinity? */
+ if (unlikely(migrate_disabled_updated(p)))
+ return;
+
+ /*
+ * Since this is always current we can get away with only locking
+ * rq->lock, the ->cpus_allowed value can normally only be changed
+ * while holding both p->pi_lock and rq->lock, but seeing that this
+ * is current, we cannot actually be waking up, so all code that
+ * relies on serialization against p->pi_lock is out of scope.
+ *
+ * Having rq->lock serializes us against things like
+ * set_cpus_allowed_ptr() that can still happen concurrently.
+ */
+ mask = tsk_cpus_allowed(p);
+
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, mask);
+ /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */
+ p->nr_cpus_allowed = 1;
+
+ /* Let migrate_enable know to fix things back up */
+ p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN;
+}
+
+/*
+ * migrate_disable - keep the current task on the CPU it is running on.
+ *
+ * Calls nest: only the outermost call pins the CPU, inner calls merely
+ * bump current->migrate_disable. Each call must be paired with
+ * migrate_enable().
+ *
+ * When called from atomic context nothing is changed, since the task
+ * cannot migrate there anyway; with CONFIG_SCHED_DEBUG the call is only
+ * counted in migrate_disable_atomic.
+ */
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (in_atomic()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic++;
+#endif
+ return;
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ /* A leftover atomic count here means disable/enable were unbalanced. */
+ if (unlikely(p->migrate_disable_atomic)) {
+ tracing_off();
+ WARN_ON_ONCE(1);
+ }
+#endif
+
+ /* Nested call: the CPU is already pinned, just count it. */
+ if (p->migrate_disable) {
+ p->migrate_disable++;
+ return;
+ }
+
+ /*
+ * Outermost call. The lazy-preempt disable taken here is intentionally
+ * not dropped before returning; migrate_enable() releases it.
+ */
+ preempt_disable();
+ preempt_lazy_disable();
+ pin_current_cpu();
+ p->migrate_disable = 1;
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+/*
+ * migrate_enable - undo one migrate_disable() on the current task.
+ *
+ * Inner (nested) calls only decrement the count. The outermost call
+ * clears current->migrate_disable, restores the affinity that
+ * update_migrate_disable() may have narrowed, unpins the CPU and drops
+ * the lazy-preempt disable taken by migrate_disable().
+ *
+ * As in migrate_disable(), a call from atomic context only touches the
+ * CONFIG_SCHED_DEBUG counter.
+ */
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+ const struct cpumask *mask;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (in_atomic()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic--;
+#endif
+ return;
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ /* A leftover atomic count here means disable/enable were unbalanced. */
+ if (unlikely(p->migrate_disable_atomic)) {
+ tracing_off();
+ WARN_ON_ONCE(1);
+ }
+#endif
+ WARN_ON_ONCE(p->migrate_disable <= 0);
+
+ /* Compare the count without the MIGRATE_DISABLE_SET_AFFIN flag bit. */
+ if (migrate_disable_count(p) > 1) {
+ p->migrate_disable--;
+ return;
+ }
+
+ preempt_disable();
+ if (unlikely(migrate_disabled_updated(p))) {
+ /*
+ * Undo whatever update_migrate_disable() did, also see there
+ * about locking.
+ */
+ rq = this_rq();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * Clearing migrate_disable causes tsk_cpus_allowed to
+ * show the task's original cpu affinity.
+ */
+ p->migrate_disable = 0;
+ mask = tsk_cpus_allowed(p);
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, mask);
+ p->nr_cpus_allowed = cpumask_weight(mask);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ } else
+ p->migrate_disable = 0;
+
+ unpin_current_cpu();
+ preempt_enable();
+ /* Pairs with the preempt_lazy_disable() in migrate_disable(). */
+ preempt_lazy_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+#else
+static inline void update_migrate_disable(struct task_struct *p) { }
+#define migrate_disabled_updated(p) 0
+#endif
+
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
if (prev->on_rq || rq->skip_clock_update < 0)
@@ -2680,6 +2864,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ update_migrate_disable(prev);
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2687,19 +2873,6 @@ need_resched:
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
-
- /*
- * If a worker went to sleep, notify and ask workqueue
- * whether it wants to wake up a task to maintain
- * concurrency.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
- to_wakeup = wq_worker_sleeping(prev, cpu);
- if (to_wakeup)
- try_to_wake_up_local(to_wakeup);
- }
}
switch_count = &prev->nvcsw;
}
@@ -2712,6 +2885,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -2741,8 +2915,19 @@ need_resched:
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
+ return;
+ /*
+ * If a worker went to sleep, notify and ask workqueue whether
+ * it wants to wake up a task to maintain concurrency.
+ */
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+
+
+ if (tsk_is_pi_blocked(tsk))
return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -2751,12 +2936,19 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
+/*
+ * Counterpart of the wq_worker_sleeping() notification issued from
+ * sched_submit_work(): once schedule() returns, tell the workqueue code
+ * that this worker is running again. Tasks without PF_WQ_WORKER are
+ * ignored.
+ */
+static inline void sched_update_worker(struct task_struct *tsk)
+{
+	if (!(tsk->flags & PF_WQ_WORKER))
+		return;
+
+	wq_worker_running(tsk);
+}
+
asmlinkage void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
__schedule();
+ sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -2802,9 +2994,26 @@ asmlinkage void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
+#ifdef CONFIG_PREEMPT_LAZY
+ /*
+ * Check for lazy preemption
+ */
+ if (current_thread_info()->preempt_lazy_count &&
+ !test_thread_flag(TIF_NEED_RESCHED))
+ return;
+#endif
do {
__preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * The add/subtract must not be traced by the function
+ * tracer. But we still want to account for the
+ * preempt off latency tracer. Since the _notrace versions
+ * of add/subtract skip the accounting for latency tracer
+ * we must force it manually.
+ */
+ start_critical_timings();
__schedule();
+ stop_critical_timings();
__preempt_count_sub(PREEMPT_ACTIVE);
/*
@@ -2912,7 +3121,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
@@ -3193,9 +3403,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_new = 1;
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -3215,9 +3424,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* getparam()/getattr() don't report silly values for !rt tasks.
*/
p->rt_priority = attr->sched_priority;
-
p->normal_prio = normal_prio(p);
- p->prio = rt_mutex_getprio(p);
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there were no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3225,8 +3446,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
-
- set_load_weight(p);
}
static void
@@ -3302,6 +3521,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
int policy = attr->sched_policy;
unsigned long flags;
@@ -3426,6 +3647,7 @@ recheck:
if (dl_policy(policy))
goto change;
+ p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -3479,6 +3701,24 @@ change:
return -EBUSY;
}
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboosts
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -3486,16 +3726,18 @@ change:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4104,9 +4346,16 @@ SYSCALL_DEFINE0(sched_yield)
static void __cond_resched(void)
{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ /*
+ * Check again in case we missed a preemption
+ * opportunity between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
}
int __sched _cond_resched(void)
@@ -4147,6 +4396,7 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
+#ifndef CONFIG_PREEMPT_RT_FULL
int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
@@ -4160,6 +4410,7 @@ int __sched __cond_resched_softirq(void)
return 0;
}
EXPORT_SYMBOL(__cond_resched_softirq);
+#endif
/**
* yield - yield the current processor to other threads.
@@ -4513,6 +4764,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4520,7 +4772,9 @@ void init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
@@ -4535,11 +4789,90 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
+ if (!migrate_disabled_updated(p)) {
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+ }
cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
+static DEFINE_MUTEX(sched_down_mutex);
+static cpumask_t sched_down_cpumask;
+
+/*
+ * Record that @cpu is on its way down. migrate_me() removes every CPU set
+ * in sched_down_cpumask from its candidate destinations. Updates to the
+ * mask are serialized by sched_down_mutex.
+ */
+void tell_sched_cpu_down_begin(int cpu)
+{
+ mutex_lock(&sched_down_mutex);
+ cpumask_set_cpu(cpu, &sched_down_cpumask);
+ mutex_unlock(&sched_down_mutex);
+}
+
+/*
+ * Clear the "going down" mark that tell_sched_cpu_down_begin() set for
+ * @cpu, so migrate_me() no longer excludes it on account of this mask.
+ * Updates to sched_down_cpumask are serialized by sched_down_mutex.
+ */
+void tell_sched_cpu_down_done(int cpu)
+{
+ mutex_lock(&sched_down_mutex);
+ cpumask_clear_cpu(cpu, &sched_down_cpumask);
+ mutex_unlock(&sched_down_mutex);
+}
+
+/**
+ * migrate_me - try to move the current task off this cpu
+ *
+ * Used by the pin_current_cpu() code to try to get tasks
+ * to move off the current CPU as it is going down.
+ * It will only move the task if the task isn't pinned to
+ * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
+ * and the task has to be in a RUNNING state. Otherwise the
+ * movement of the task will wake it up (change its state
+ * to running) when the task did not expect it.
+ *
+ * A destination is only accepted if it is allowed for the task,
+ * not marked in sched_down_cpumask and currently active.
+ *
+ * Returns 1 if a migration to such a CPU was requested,
+ * 0 if the task cannot be moved.
+ */
+int migrate_me(void)
+{
+	struct task_struct *p = current;
+	struct migration_arg arg;
+	struct cpumask *cpumask;
+	unsigned long flags;
+	unsigned int dest_cpu;
+	struct rq *rq;
+
+	/*
+	 * We can not migrate tasks bound to a CPU or tasks not
+	 * running. The movement of the task will wake it up.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY || p->state)
+		return 0;
+
+	/* Keeps sched_down_cpumask stable until the migration is done. */
+	mutex_lock(&sched_down_mutex);
+	rq = task_rq_lock(p, &flags);
+
+	/* Candidates: CPUs the task may use that are not going down. */
+	cpumask = &__get_cpu_var(sched_cpumasks);
+	cpumask_andnot(cpumask, &p->cpus_allowed, &sched_down_cpumask);
+
+	/*
+	 * The destination must also be active. cpumask_any_and() returns
+	 * a value >= nr_cpu_ids when the intersection is empty, which
+	 * covers both "it's only on this CPU" and "no remaining candidate
+	 * is active". Never hand such a value to the stopper.
+	 */
+	dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
+	if (dest_cpu >= nr_cpu_ids) {
+		task_rq_unlock(rq, p, &flags);
+		mutex_unlock(&sched_down_mutex);
+		return 0;
+	}
+
+	arg.task = p;
+	arg.dest_cpu = dest_cpu;
+
+	/* The stopper must be invoked without rq->lock held. */
+	task_rq_unlock(rq, p, &flags);
+
+	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+	tlb_migrate_finish(p->mm);
+	mutex_unlock(&sched_down_mutex);
+
+	return 1;
}
/*
@@ -4585,7 +4918,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
@@ -4722,6 +5055,8 @@ static int migration_cpu_stop(void *data)
#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
+
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -4734,7 +5069,12 @@ void idle_task_exit(void)
if (mm != &init_mm)
switch_mm(mm, &init_mm, current);
- mmdrop(mm);
+
+ /*
+ * Defer the cleanup to an alive cpu. On RT we can neither
+ * call mmdrop() nor mmdrop_delayed() from here.
+ */
+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
}
/*
@@ -5058,6 +5398,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD:
calc_load_migrate(rq);
+ if (per_cpu(idle_last_mm, cpu)) {
+ mmdrop(per_cpu(idle_last_mm, cpu));
+ per_cpu(idle_last_mm, cpu) = NULL;
+ }
break;
#endif
}
@@ -6966,7 +7310,8 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+ int nested = (preempt_count() & ~PREEMPT_ACTIVE) +
+ sched_rcu_preempt_depth();
return (nested == preempt_offset);
}
@@ -6976,7 +7321,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6994,6 +7340,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cfe2f268afaa..760e149fdcd3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -655,37 +655,45 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
__vtime_account_system(tsk);
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
void vtime_user_enter(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
void vtime_guest_enter(struct task_struct *tsk)
@@ -697,19 +705,23 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ raw_spin_lock(&tsk->vtime_lock);
+ write_seqcount_begin(&tsk->vtime_seq);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seq);
+ raw_spin_unlock(&tsk->vtime_lock);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -722,24 +734,30 @@ void vtime_account_idle(struct task_struct *tsk)
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqlock(&prev->vtime_seqlock);
+ raw_spin_lock(&prev->vtime_lock);
+ write_seqcount_begin(&prev->vtime_seq);
prev->vtime_snap_whence = VTIME_SLEEPING;
- write_sequnlock(&prev->vtime_seqlock);
+ write_seqcount_end(&prev->vtime_seq);
+ raw_spin_unlock(&prev->vtime_lock);
- write_seqlock(&current->vtime_seqlock);
+ raw_spin_lock(&current->vtime_lock);
+ write_seqcount_begin(&current->vtime_seq);
current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_sequnlock(&current->vtime_seqlock);
+ write_seqcount_end(&current->vtime_seq);
+ raw_spin_unlock(&current->vtime_lock);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
- write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ raw_spin_lock_irqsave(&t->vtime_lock, flags);
+ write_seqcount_begin(&t->vtime_seq);
t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu);
- write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+ write_seqcount_end(&t->vtime_seq);
+ raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
}
cputime_t task_gtime(struct task_struct *t)
@@ -748,13 +766,13 @@ cputime_t task_gtime(struct task_struct *t)
cputime_t gtime;
do {
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seq);
gtime = t->gtime;
if (t->flags & PF_VCPU)
gtime += vtime_delta(t);
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seq, seq));
return gtime;
}
@@ -777,7 +795,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seq);
if (u_dst)
*u_dst = *u_src;
@@ -801,7 +819,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seq, seq));
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 183e8e5c38ba..8892e57abd03 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -260,6 +260,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
+#ifdef CONFIG_SMP
+ P(rt_nr_migratory);
+#endif
#undef PN
#undef P
@@ -639,6 +642,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif
P(policy);
P(prio);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ P(migrate_disable);
+#endif
+ P(nr_cpus_allowed);
#undef PN
#undef __PN
#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b4c4f320130..ffe5ac103efa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2679,7 +2679,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_task_lazy(rq_of(cfs_rq)->curr);
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -2703,7 +2703,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_task(rq_of(cfs_rq)->curr);
+ resched_task_lazy(rq_of(cfs_rq)->curr);
}
static void
@@ -2824,7 +2824,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_task_lazy(rq_of(cfs_rq)->curr);
return;
}
/*
@@ -3013,7 +3013,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ resched_task_lazy(rq_of(cfs_rq)->curr);
}
static __always_inline
@@ -3612,7 +3612,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (rq->curr == p)
- resched_task(p);
+ resched_task_lazy(p);
return;
}
@@ -4477,7 +4477,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_task(curr);
+ resched_task_lazy(curr);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -6965,7 +6965,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_task(rq->curr);
+ resched_task_lazy(rq->curr);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -6990,7 +6990,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (rq->curr == p) {
if (p->prio > oldprio)
- resched_task(rq->curr);
+ resched_task_lazy(rq->curr);
} else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3a..ef66ab95271c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -50,11 +50,18 @@ SCHED_FEAT(LB_BIAS, true)
*/
SCHED_FEAT(NONTASK_POWER, true)
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#else
+SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
+#endif
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 27b8e836307f..14bded52e3d4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -43,6 +43,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.irqsafe = 1;
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f964add50f38..e97603dde6f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1047,6 +1047,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1200,6 +1201,15 @@ extern void init_sched_dl_class(void);
extern void resched_task(struct task_struct *p);
extern void resched_cpu(int cpu);
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_task_lazy(struct task_struct *tsk);
+#else
+/* Without CONFIG_PREEMPT_LAZY a lazy reschedule request is an ordinary one. */
+static inline void resched_task_lazy(struct task_struct *tsk)
+{
+ resched_task(tsk);
+}
+#endif
+
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
diff --git a/kernel/sched/wait-simple.c b/kernel/sched/wait-simple.c
new file mode 100644
index 000000000000..7dfa86d1f654
--- /dev/null
+++ b/kernel/sched/wait-simple.c
@@ -0,0 +1,115 @@
+/*
+ * Simple waitqueues without fancy flags and callbacks
+ *
+ * (C) 2011 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Based on kernel/wait.c
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/wait-simple.h>
+
+/*
+ * Adds w to head->list. Must be called with head->lock locked.
+ *
+ * list_add() inserts at the front of the list, and the wake side walks
+ * the list from the front, so the most recently queued waiter is
+ * considered first.
+ */
+static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w)
+{
+ list_add(&w->node, &head->list);
+ /* We can't let the condition leak before the setting of head */
+ smp_mb();
+}
+
+/*
+ * Removes w from head->list. Must be called with head->lock locked.
+ *
+ * list_del_init() leaves w->node pointing at itself, so
+ * list_empty(&w->node) is true afterwards and swait_prepare_locked()
+ * will queue the waiter again on its next call.
+ */
+static inline void __swait_dequeue(struct swaiter *w)
+{
+ list_del_init(&w->node);
+}
+
+/*
+ * Initialize a simple wait queue head: set up its raw spinlock, attach
+ * the lockdep class @key to that lock and start with an empty waiter
+ * list.
+ */
+void __init_swait_head(struct swait_head *head, struct lock_class_key *key)
+{
+ raw_spin_lock_init(&head->lock);
+ lockdep_set_class(&head->lock, key);
+ INIT_LIST_HEAD(&head->list);
+}
+EXPORT_SYMBOL(__init_swait_head);
+
+/*
+ * Register the current task as the owner of waiter @w and queue @w on
+ * @head unless it is already queued. The caller holds head->lock (see
+ * swait_prepare()) and is responsible for setting the task state.
+ */
+void swait_prepare_locked(struct swait_head *head, struct swaiter *w)
+{
+ w->task = current;
+ if (list_empty(&w->node))
+ __swait_enqueue(head, w);
+}
+
+/*
+ * Queue waiter @w on @head and put the current task into @state.
+ *
+ * Both steps happen with head->lock held and interrupts disabled, so the
+ * task state is set only after the waiter is on the list.
+ */
+void swait_prepare(struct swait_head *head, struct swaiter *w, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&head->lock, flags);
+ swait_prepare_locked(head, w);
+ __set_current_state(state);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
+}
+EXPORT_SYMBOL(swait_prepare);
+
+/*
+ * Locked variant of swait_finish(): mark the current task runnable and
+ * take @w off the list if no waker has cleared w->task yet.
+ * NOTE(review): presumably the caller must hold head->lock, as
+ * __swait_dequeue() requires; @head itself is not used here.
+ */
+void swait_finish_locked(struct swait_head *head, struct swaiter *w)
+{
+ __set_current_state(TASK_RUNNING);
+ if (w->task)
+ __swait_dequeue(w);
+}
+
+/*
+ * End a wait: mark the current task runnable and make sure @w is no
+ * longer queued on @head.
+ *
+ * w->task is tested without holding head->lock. The waker dequeues the
+ * waiter first and only then stores NULL to w->task, so a NULL value
+ * means the waiter is already off the list and the lock can be skipped.
+ */
+void swait_finish(struct swait_head *head, struct swaiter *w)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ if (w->task) {
+ raw_spin_lock_irqsave(&head->lock, flags);
+ __swait_dequeue(w);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
+ }
+}
+EXPORT_SYMBOL(swait_finish);
+
+/*
+ * Wake up to @num waiters queued on @head. The caller holds head->lock.
+ *
+ * Each waiter's task is passed to wake_up_state() with @state; waiters
+ * for which that call reports no wakeup stay on the list. With @num == 0
+ * the limit check is never satisfied, so every wakeable waiter is woken.
+ *
+ * Returns the number of tasks woken.
+ */
+unsigned int
+__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num)
+{
+	struct swaiter *curr, *next;
+	/* Unsigned to match both @num and the return type. */
+	unsigned int woken = 0;
+
+	list_for_each_entry_safe(curr, next, &head->list, node) {
+		if (wake_up_state(curr->task, state)) {
+			__swait_dequeue(curr);
+			/*
+			 * The waiting task can free the waiter as
+			 * soon as curr->task = NULL is written,
+			 * without taking any locks. A memory barrier
+			 * is required here to prevent the following
+			 * store to curr->task from getting ahead of
+			 * the dequeue operation.
+			 */
+			smp_wmb();
+			curr->task = NULL;
+			if (++woken == num)
+				break;
+		}
+	}
+	return woken;
+}
+
+/*
+ * Wake up to @num waiters on @head whose tasks can be woken from @state.
+ *
+ * Returns 0 without taking head->lock when swaitqueue_active() reports
+ * no waiters; otherwise takes the lock with interrupts disabled and
+ * returns the number of tasks woken by __swait_wake_locked().
+ */
+unsigned int
+__swait_wake(struct swait_head *head, unsigned int state, unsigned int num)
+{
+	unsigned long flags;
+	/* Unsigned to match __swait_wake_locked() and the return type. */
+	unsigned int woken;
+
+	if (!swaitqueue_active(head))
+		return 0;
+
+	raw_spin_lock_irqsave(&head->lock, flags);
+	woken = __swait_wake_locked(head, state, num);
+	raw_spin_unlock_irqrestore(&head->lock, flags);
+	return woken;
+}
+EXPORT_SYMBOL(__swait_wake);
diff --git a/kernel/signal.c b/kernel/signal.c
index 52f881db1ca0..ea098c6535b8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -14,6 +14,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -349,13 +350,45 @@ static bool task_participate_group_stop(struct task_struct *task)
return false;
}
+#ifdef __HAVE_ARCH_CMPXCHG
+/*
+ * Take the cached sigqueue entry of @t, if there is one.
+ *
+ * The slot is emptied with cmpxchg(); if it changed between the read and
+ * the exchange nothing is taken and NULL is returned.
+ */
+static inline struct sigqueue *get_task_cache(struct task_struct *t)
+{
+	struct sigqueue *cached = t->sigqueue_cache;
+	struct sigqueue *seen = cmpxchg(&t->sigqueue_cache, cached, NULL);
+
+	return seen == cached ? cached : NULL;
+}
+
+/*
+ * Park @q in the sigqueue cache slot of @t.
+ *
+ * Only an empty slot is filled. Returns 0 when @q was stored and 1 when
+ * the slot was already occupied.
+ */
+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+{
+	return cmpxchg(&t->sigqueue_cache, NULL, q) != NULL;
+}
+
+#else
+
+/*
+ * Fallback when __HAVE_ARCH_CMPXCHG is not defined: there is no per-task
+ * sigqueue cache, so there is never anything to hand out.
+ */
+static inline struct sigqueue *get_task_cache(struct task_struct *t)
+{
+ return NULL;
+}
+
+/*
+ * Fallback when __HAVE_ARCH_CMPXCHG is not defined: always reports
+ * failure (1), so callers free @q the regular way.
+ */
+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+{
+ return 1;
+}
+
+#endif
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
+ int override_rlimit, int fromslab)
{
struct sigqueue *q = NULL;
struct user_struct *user;
@@ -372,7 +405,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
if (override_rlimit ||
atomic_read(&user->sigpending) <=
task_rlimit(t, RLIMIT_SIGPENDING)) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
+ if (!fromslab)
+ q = get_task_cache(t);
+ if (!q)
+ q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
@@ -389,6 +425,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
return q;
}
+/*
+ * Allocate a sigqueue entry for @t. Passing fromslab == 0 lets
+ * __sigqueue_do_alloc() try the per-task cache first and fall back to
+ * the slab when that yields nothing.
+ */
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
+ int override_rlimit)
+{
+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
+}
+
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
@@ -398,6 +441,21 @@ static void __sigqueue_free(struct sigqueue *q)
kmem_cache_free(sigqueue_cachep, q);
}
+/*
+ * Release a sigqueue entry on behalf of current.
+ *
+ * Preallocated entries are left alone. If current has a realtime
+ * normal_prio and its cache slot is free, the entry is parked there and
+ * only the pending-signal accounting and the user reference are dropped.
+ * In every other case the entry is freed through __sigqueue_free().
+ */
+static void sigqueue_free_current(struct sigqueue *q)
+{
+	struct user_struct *owner;
+
+	if (q->flags & SIGQUEUE_PREALLOC)
+		return;
+
+	/* Read the owner before @q is possibly handed to the cache. */
+	owner = q->user;
+
+	if (!rt_prio(current->normal_prio) || put_task_cache(current, q)) {
+		__sigqueue_free(q);
+		return;
+	}
+
+	atomic_dec(&owner->sigpending);
+	free_uid(owner);
+}
+
void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q;
@@ -411,6 +469,21 @@ void flush_sigqueue(struct sigpending *queue)
}
/*
+ * Called from __exit_signal. Flush tsk->pending and
+ * tsk->sigqueue_cache
+ */
+void flush_task_sigqueue(struct task_struct *tsk)
+{
+	struct sigqueue *cached;
+
+	flush_sigqueue(&tsk->pending);
+
+	/* Return a parked entry, if any, to the slab. */
+	cached = get_task_cache(tsk);
+	if (!cached)
+		return;
+
+	kmem_cache_free(sigqueue_cachep, cached);
+}
+
+/*
* Flush all pending signals for a task.
*/
void __flush_signals(struct task_struct *t)
@@ -562,7 +635,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
- __sigqueue_free(first);
+ sigqueue_free_current(first);
} else {
/*
* Ok, it wasn't in the queue. This must be
@@ -608,6 +681,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
int signr;
+ WARN_ON_ONCE(tsk != current);
+
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
@@ -1230,8 +1305,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
-int
-force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+static int
+do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1256,6 +1331,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
return ret;
}
+int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+/*
+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
+ * since it can not enable preemption, and the signal code's spin_locks
+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
+ * send the signal on exit of the trap.
+ */
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (in_atomic()) {
+ if (WARN_ON_ONCE(t != current))
+ return 0;
+ if (WARN_ON_ONCE(t->forced_info.si_signo))
+ return 0;
+
+ if (is_si_special(info)) {
+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
+ t->forced_info.si_signo = sig;
+ t->forced_info.si_errno = 0;
+ t->forced_info.si_code = SI_KERNEL;
+ t->forced_info.si_pid = 0;
+ t->forced_info.si_uid = 0;
+ } else {
+ t->forced_info = *info;
+ }
+
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return 0;
+ }
+#endif
+ return do_force_sig_info(sig, info, t);
+}
+
/*
* Nuke all other threads in the group.
*/
@@ -1286,12 +1394,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
struct sighand_struct *sighand;
for (;;) {
- local_irq_save(*flags);
+ local_irq_save_nort(*flags);
rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
if (unlikely(sighand == NULL)) {
rcu_read_unlock();
- local_irq_restore(*flags);
+ local_irq_restore_nort(*flags);
break;
}
@@ -1302,7 +1410,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
}
spin_unlock(&sighand->siglock);
rcu_read_unlock();
- local_irq_restore(*flags);
+ local_irq_restore_nort(*flags);
}
return sighand;
@@ -1547,7 +1655,8 @@ EXPORT_SYMBOL(kill_pid);
*/
struct sigqueue *sigqueue_alloc(void)
{
- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+ /* Preallocated sigqueue objects always come from the slab cache! */
+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
if (q)
q->flags |= SIGQUEUE_PREALLOC;
@@ -1908,15 +2017,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
- preempt_enable_no_resched();
freezable_schedule();
} else {
/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 490fcbb1dc5b..ea86b08191bc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,10 +21,12 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <linux/tick.h>
+#include <linux/locallock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -61,6 +63,98 @@ const char * const softirq_to_name[NR_SOFTIRQS] = {
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
+#ifdef CONFIG_NO_HZ_COMMON
+# ifdef CONFIG_PREEMPT_RT_FULL
+
+struct softirq_runner {
+ struct task_struct *runner[NR_SOFTIRQS];
+};
+
+static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
+
+static inline void softirq_set_runner(unsigned int sirq)
+{
+ struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
+
+ sr->runner[sirq] = current;
+}
+
+static inline void softirq_clr_runner(unsigned int sirq)
+{
+ struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
+
+ sr->runner[sirq] = NULL;
+}
+
+/*
+ * On preempt-rt a softirq running context might be blocked on a
+ * lock. There might be no other runnable task on this CPU because the
+ * lock owner runs on some other CPU. So we have to go into idle with
+ * the pending bit set. Therefore we need to check this, otherwise we
+ * warn about false positives which confuses users and defeats the
+ * whole purpose of this test.
+ *
+ * This code is called with interrupts disabled.
+ */
+void softirq_check_pending_idle(void)
+{
+ static int rate_limit;
+ struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
+ u32 warnpending;
+ int i;
+
+ if (rate_limit >= 10)
+ return;
+
+ warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ struct task_struct *tsk = sr->runner[i];
+
+ /*
+ * The wakeup code in rtmutex.c wakes up the task
+ * _before_ it sets pi_blocked_on to NULL under
+ * tsk->pi_lock. So we need to check for both: state
+ * and pi_blocked_on.
+ */
+ if (tsk) {
+ raw_spin_lock(&tsk->pi_lock);
+ if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
+ /* Clear all bits pending in that task */
+ warnpending &= ~(tsk->softirqs_raised);
+ warnpending &= ~(1 << i);
+ }
+ raw_spin_unlock(&tsk->pi_lock);
+ }
+ }
+
+ if (warnpending) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ warnpending);
+ rate_limit++;
+ }
+}
+# else
+/*
+ * On !PREEMPT_RT we just printk rate limited:
+ */
+void softirq_check_pending_idle(void)
+{
+ static int rate_limit;
+
+ if (rate_limit < 10 &&
+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ local_softirq_pending());
+ rate_limit++;
+ }
+}
+# endif
+
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void softirq_set_runner(unsigned int sirq) { }
+static inline void softirq_clr_runner(unsigned int sirq) { }
+#endif
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -76,6 +170,68 @@ static void wakeup_softirqd(void)
wake_up_process(tsk);
}
+/*
+ * Run the handler of a single softirq vector.
+ *
+ * @vec_nr: index into softirq_vec
+ * @cpu: passed to rcu_bh_qs() when @need_rcu_bh_qs is set
+ * @need_rcu_bh_qs: non-zero to call rcu_bh_qs() after the handler
+ *
+ * The preempt count is sampled before the handler runs; if the handler
+ * returns with a different count, an error is logged and the sampled
+ * value is restored.
+ */
+static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs)
+{
+ struct softirq_action *h = softirq_vec + vec_nr;
+ unsigned int prev_count = preempt_count();
+
+ kstat_incr_softirqs_this_cpu(vec_nr);
+
+ trace_softirq_entry(vec_nr);
+ h->action(h);
+ trace_softirq_exit(vec_nr);
+ /* A handler must leave preempt_count exactly as it found it */
+ if (unlikely(prev_count != preempt_count())) {
+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
+ vec_nr, softirq_to_name[vec_nr], h->action,
+ prev_count, preempt_count());
+ preempt_count_set(prev_count);
+ }
+ if (need_rcu_bh_qs)
+ rcu_bh_qs(cpu);
+}
+
+#ifndef CONFIG_PREEMPT_RT_FULL
+static inline int ksoftirqd_softirq_pending(void)
+{
+ return local_softirq_pending();
+}
+
+/*
+ * Run the handler of every softirq whose bit is set in @pending, lowest
+ * bit first. Interrupts are enabled for the duration of the handlers
+ * and disabled again before returning.
+ *
+ * @cpu and @need_rcu_bh_qs are handed through to handle_softirq().
+ */
+static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs)
+{
+	struct softirq_action *h = softirq_vec;
+	int softirq_bit;
+
+	local_irq_enable();
+
+	while ((softirq_bit = ffs(pending))) {
+		unsigned int vec_nr;
+
+		/* ffs() is 1-based: advance to the vector of the found bit */
+		h += softirq_bit - 1;
+		vec_nr = h - softirq_vec;
+
+		handle_softirq(vec_nr, cpu, need_rcu_bh_qs);
+
+		/* Step past the handled vector and shift its bit out */
+		h++;
+		pending >>= softirq_bit;
+	}
+
+	local_irq_disable();
+}
+
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (ksoftirqd_softirq_pending()) {
+ __do_softirq();
+ rcu_note_context_switch(cpu);
+ local_irq_enable();
+ cond_resched();
+ return;
+ }
+ local_irq_enable();
+}
+
/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
@@ -227,10 +383,8 @@ asmlinkage void __do_softirq(void)
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
int max_restart = MAX_SOFTIRQ_RESTART;
- struct softirq_action *h;
bool in_hardirq;
__u32 pending;
- int softirq_bit;
int cpu;
/*
@@ -251,36 +405,7 @@ restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
- local_irq_enable();
-
- h = softirq_vec;
-
- while ((softirq_bit = ffs(pending))) {
- unsigned int vec_nr;
- int prev_count;
-
- h += softirq_bit - 1;
-
- vec_nr = h - softirq_vec;
- prev_count = preempt_count();
-
- kstat_incr_softirqs_this_cpu(vec_nr);
-
- trace_softirq_entry(vec_nr);
- h->action(h);
- trace_softirq_exit(vec_nr);
- if (unlikely(prev_count != preempt_count())) {
- pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
- vec_nr, softirq_to_name[vec_nr], h->action,
- prev_count, preempt_count());
- preempt_count_set(prev_count);
- }
- rcu_bh_qs(cpu);
- h++;
- pending >>= softirq_bit;
- }
-
- local_irq_disable();
+ handle_pending_softirqs(pending, cpu, 1);
pending = local_softirq_pending();
if (pending) {
@@ -317,6 +442,286 @@ asmlinkage void do_softirq(void)
}
/*
+ * This function must run with irqs disabled!
+ */
+void raise_softirq_irqoff(unsigned int nr)
+{
+ __raise_softirq_irqoff(nr);
+
+ /*
+ * If we're in an interrupt or softirq, we're done
+ * (this also catches softirq-disabled code). We will
+ * actually run the softirq once we return from
+ * the irq or softirq.
+ *
+ * Otherwise we wake up ksoftirqd to make sure we
+ * schedule the softirq soon.
+ */
+ if (!in_interrupt())
+ wakeup_softirqd();
+}
+
+void __raise_softirq_irqoff(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ or_softirq_pending(1UL << nr);
+}
+
+static inline void local_bh_disable_nort(void) { local_bh_disable(); }
+static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
+static void ksoftirqd_set_sched_params(unsigned int cpu) { }
+static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { }
+
+#else /* PREEMPT_RT_FULL */
+
+/*
+ * On RT we serialize softirq execution with a cpu local lock per softirq
+ */
+static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
+
+void __init softirq_early_init(void)
+{
+ int i;
+
+ for (i = 0; i < NR_SOFTIRQS; i++)
+ local_irq_lock_init(local_softirq_locks[i]);
+}
+
+static void lock_softirq(int which)
+{
+ local_lock(local_softirq_locks[which]);
+}
+
+static void unlock_softirq(int which)
+{
+ local_unlock(local_softirq_locks[which]);
+}
+
+/*
+ * Run one softirq vector from thread context.
+ *
+ * Interrupts are enabled only around the handler and are disabled
+ * again before returning. PF_MEMALLOC is cleared while the handler
+ * runs and restored from the saved flags afterwards; PF_IN_SOFTIRQ
+ * marks the task as serving a softirq (see in_serving_softirq()).
+ *
+ * @which: softirq vector number
+ * @need_rcu_bh_qs: handed through to handle_softirq()
+ */
+static void do_single_softirq(int which, int need_rcu_bh_qs)
+{
+	unsigned long old_flags = current->flags;
+
+	current->flags &= ~PF_MEMALLOC;
+	vtime_account_irq_enter(current);
+	current->flags |= PF_IN_SOFTIRQ;
+	lockdep_softirq_enter();
+	local_irq_enable();
+	handle_softirq(which, smp_processor_id(), need_rcu_bh_qs);
+	local_irq_disable();
+	lockdep_softirq_exit();
+	current->flags &= ~PF_IN_SOFTIRQ;
+	/* Pair the enter hook above with the matching exit hook */
+	vtime_account_irq_exit(current);
+	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+}
+
+/*
+ * Called with interrupts disabled. Process softirqs which were raised
+ * in current context (or on behalf of ksoftirqd).
+ */
+static void do_current_softirqs(int need_rcu_bh_qs)
+{
+ while (current->softirqs_raised) {
+ int i = __ffs(current->softirqs_raised);
+ unsigned int pending, mask = (1U << i);
+
+ current->softirqs_raised &= ~mask;
+ local_irq_enable();
+
+ /*
+ * If the lock is contended, we boost the owner to
+ * process the softirq or leave the critical section
+ * now.
+ */
+ lock_softirq(i);
+ local_irq_disable();
+ softirq_set_runner(i);
+ /*
+ * Check with the local_softirq_pending() bits,
+ * whether we need to process this still or if someone
+ * else took care of it.
+ */
+ pending = local_softirq_pending();
+ if (pending & mask) {
+ set_softirq_pending(pending & ~mask);
+ do_single_softirq(i, need_rcu_bh_qs);
+ }
+ softirq_clr_runner(i);
+ unlock_softirq(i);
+ WARN_ON(current->softirq_nestcnt != 1);
+ }
+}
+
+static void __local_bh_disable(void)
+{
+ if (++current->softirq_nestcnt == 1)
+ migrate_disable();
+}
+
+void local_bh_disable(void)
+{
+ __local_bh_disable();
+}
+EXPORT_SYMBOL(local_bh_disable);
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ __local_bh_disable();
+ if (cnt & PREEMPT_CHECK_OFFSET)
+ preempt_disable();
+}
+
+static void __local_bh_enable(void)
+{
+ if (WARN_ON(current->softirq_nestcnt == 0))
+ return;
+
+ local_irq_disable();
+ if (current->softirq_nestcnt == 1 && current->softirqs_raised)
+ do_current_softirqs(1);
+ local_irq_enable();
+
+ if (--current->softirq_nestcnt == 0)
+ migrate_enable();
+}
+
+void local_bh_enable(void)
+{
+ __local_bh_enable();
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+/*
+ * Counterpart of __local_bh_disable_ip(): leave the bh-disabled section
+ * and, if @cnt carries PREEMPT_CHECK_OFFSET, re-enable preemption.
+ * @ip is unused here.
+ *
+ * NOTE(review): __local_bh_enable() may process raised softirqs before
+ * preempt_enable() runs - confirm that callers passing
+ * PREEMPT_CHECK_OFFSET tolerate that ordering.
+ */
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+	__local_bh_enable();
+	if (cnt & PREEMPT_CHECK_OFFSET)
+		preempt_enable();
+}
+
+void local_bh_enable_ip(unsigned long ip)
+{
+ local_bh_enable();
+}
+EXPORT_SYMBOL(local_bh_enable_ip);
+
+void _local_bh_enable(void)
+{
+ if (WARN_ON(current->softirq_nestcnt == 0))
+ return;
+ if (--current->softirq_nestcnt == 0)
+ migrate_enable();
+}
+EXPORT_SYMBOL(_local_bh_enable);
+
+int in_serving_softirq(void)
+{
+ return current->flags & PF_IN_SOFTIRQ;
+}
+EXPORT_SYMBOL(in_serving_softirq);
+
+/* Called with preemption disabled */
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ current->softirq_nestcnt++;
+
+ do_current_softirqs(1);
+ current->softirq_nestcnt--;
+ rcu_note_context_switch(cpu);
+ local_irq_enable();
+}
+
+/*
+ * Called from netif_rx_ni(). Preemption enabled, but migration
+ * disabled. So the cpu can't go away under us.
+ */
+void thread_do_softirq(void)
+{
+ if (!in_serving_softirq() && current->softirqs_raised) {
+ current->softirq_nestcnt++;
+ do_current_softirqs(0);
+ current->softirq_nestcnt--;
+ }
+}
+
+static void do_raise_softirq_irqoff(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ or_softirq_pending(1UL << nr);
+ /*
+ * If we are not in a hard interrupt and inside a bh disabled
+ * region, we simply raise the flag on current. local_bh_enable()
+ * will make sure that the softirq is executed. Otherwise we
+ * delegate it to ksoftirqd.
+ */
+
+ if (!in_irq() && current->softirq_nestcnt)
+ current->softirqs_raised |= (1U << nr);
+ else if (__this_cpu_read(ksoftirqd))
+ __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr);
+}
+
+void __raise_softirq_irqoff(unsigned int nr)
+{
+ do_raise_softirq_irqoff(nr);
+ if (!in_irq() && !current->softirq_nestcnt)
+ wakeup_softirqd();
+}
+
+/*
+ * This function must run with irqs disabled!
+ */
+void raise_softirq_irqoff(unsigned int nr)
+{
+ do_raise_softirq_irqoff(nr);
+
+ /*
+ * If we're in a hard interrupt we let irq return code deal
+ * with the wakeup of ksoftirqd.
+ */
+ if (in_irq())
+ return;
+ /*
+ * If we are in thread context but outside of a bh disabled
+ * region, we need to wake ksoftirqd as well.
+ *
+ * CHECKME: Some of the places which do that could be wrapped
+ * into local_bh_disable/enable pairs. Though it's unclear
+ * whether this is worth the effort. To find those places just
+ * raise a WARN() if the condition is met.
+ *
+ */
+ if (!current->softirq_nestcnt)
+ wakeup_softirqd();
+}
+
+static inline int ksoftirqd_softirq_pending(void)
+{
+ return current->softirqs_raised;
+}
+
+static inline void local_bh_disable_nort(void) { }
+static inline void _local_bh_enable_nort(void) { }
+
+static inline void ksoftirqd_set_sched_params(unsigned int cpu)
+{
+ struct sched_param param = { .sched_priority = 1 };
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+ /* Take over all pending softirqs when starting */
+ local_irq_disable();
+ current->softirqs_raised = local_softirq_pending();
+ local_irq_enable();
+}
+
+static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online)
+{
+ struct sched_param param = { .sched_priority = 0 };
+
+ sched_setscheduler(current, SCHED_NORMAL, &param);
+}
+
+#endif /* PREEMPT_RT_FULL */
+/*
* Enter an interrupt context.
*/
void irq_enter(void)
@@ -327,9 +732,9 @@ void irq_enter(void)
* Prevent raise_softirq from needlessly waking up ksoftirqd
* here, as softirq will be serviced on return from interrupt.
*/
- local_bh_disable();
+ local_bh_disable_nort();
tick_irq_enter();
- _local_bh_enable();
+ _local_bh_enable_nort();
}
__irq_enter();
@@ -337,6 +742,7 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
+#ifndef CONFIG_PREEMPT_RT_FULL
if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -356,6 +762,15 @@ static inline void invoke_softirq(void)
} else {
wakeup_softirqd();
}
+#else /* PREEMPT_RT_FULL */
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(ksoftirqd) &&
+ __this_cpu_read(ksoftirqd)->softirqs_raised)
+ wakeup_softirqd();
+ local_irq_restore(flags);
+#endif
}
static inline void tick_irq_exit(void)
@@ -392,26 +807,6 @@ void irq_exit(void)
trace_hardirq_exit(); /* must be last! */
}
-/*
- * This function must run with irqs disabled!
- */
-inline void raise_softirq_irqoff(unsigned int nr)
-{
- __raise_softirq_irqoff(nr);
-
- /*
- * If we're in an interrupt or softirq, we're done
- * (this also catches softirq-disabled code). We will
- * actually run the softirq once we return from
- * the irq or softirq.
- *
- * Otherwise we wake up ksoftirqd to make sure we
- * schedule the softirq soon.
- */
- if (!in_interrupt())
- wakeup_softirqd();
-}
-
void raise_softirq(unsigned int nr)
{
unsigned long flags;
@@ -421,12 +816,6 @@ void raise_softirq(unsigned int nr)
local_irq_restore(flags);
}
-void __raise_softirq_irqoff(unsigned int nr)
-{
- trace_softirq_raise(nr);
- or_softirq_pending(1UL << nr);
-}
-
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
@@ -443,15 +832,45 @@ struct tasklet_head {
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
+/*
+ * Append @t to the tasklet list @head and raise softirq @nr, provided
+ * the tasklet lock can be taken and TASKLET_STATE_SCHED is still set.
+ * If the trylock fails, nothing is queued here.
+ *
+ * The callers in this file invoke it between local_irq_save() and
+ * local_irq_restore().
+ */
+static inline void
+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
+{
+	if (tasklet_trylock(t)) {
+again:
+		/* We may have been preempted before tasklet_trylock
+		 * and __tasklet_action may have already run.
+		 * So double check the sched bit while the tasklet
+		 * is locked before adding it to the list.
+		 */
+		if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
+			t->next = NULL;
+			*head->tail = t;
+			head->tail = &(t->next);
+			raise_softirq_irqoff(nr);
+			tasklet_unlock(t);
+		} else {
+			/* This is subtle. If we hit the corner case above,
+			 * it is possible that we get preempted right here,
+			 * and another task has successfully called
+			 * tasklet_schedule(), then this function, and
+			 * failed on the trylock. Thus we must be sure
+			 * before releasing the tasklet lock, that the
+			 * SCHED_BIT is clear. Otherwise the tasklet
+			 * may get its SCHED_BIT set, but not added to the
+			 * list.
+			 */
+			if (!tasklet_tryunlock(t))
+				goto again;
+		}
+	}
+}
+
void __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
- t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_schedule);
@@ -461,60 +880,123 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
unsigned long flags;
local_irq_save(flags);
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- raise_softirq_irqoff(HI_SOFTIRQ);
+ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
- BUG_ON(!irqs_disabled());
-
- t->next = __this_cpu_read(tasklet_hi_vec.head);
- __this_cpu_write(tasklet_hi_vec.head, t);
- __raise_softirq_irqoff(HI_SOFTIRQ);
+ __tasklet_hi_schedule(t);
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-static void tasklet_action(struct softirq_action *a)
+void tasklet_enable(struct tasklet_struct *t)
{
- struct tasklet_struct *list;
+ if (!atomic_dec_and_test(&t->count))
+ return;
+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+ tasklet_schedule(t);
+}
+EXPORT_SYMBOL(tasklet_enable);
- local_irq_disable();
- list = __this_cpu_read(tasklet_vec.head);
- __this_cpu_write(tasklet_vec.head, NULL);
- __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
- local_irq_enable();
+void tasklet_hi_enable(struct tasklet_struct *t)
+{
+ if (!atomic_dec_and_test(&t->count))
+ return;
+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+ tasklet_hi_schedule(t);
+}
+EXPORT_SYMBOL(tasklet_hi_enable);
+
+static void
+__tasklet_action(struct softirq_action *a, struct tasklet_struct *list)
+{
+ int loops = 1000000;
while (list) {
struct tasklet_struct *t = list;
list = list->next;
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
+ /*
+ * Should always succeed - after a tasklet got on the
+ * list (after getting the SCHED bit set from 0 to 1),
+ * nothing but the tasklet softirq it got queued to can
+ * lock it:
+ */
+ if (!tasklet_trylock(t)) {
+ WARN_ON(1);
+ continue;
}
- local_irq_disable();
t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
- local_irq_enable();
+
+ /*
+ * If we cannot handle the tasklet because it's disabled,
+ * mark it as pending. tasklet_enable() will later
+ * re-schedule the tasklet.
+ */
+ if (unlikely(atomic_read(&t->count))) {
+out_disabled:
+ /* implicit unlock: */
+ wmb();
+ t->state = TASKLET_STATEF_PENDING;
+ continue;
+ }
+
+ /*
+ * After this point on the tasklet might be rescheduled
+ * on another CPU, but it can only be added to another
+ * CPU's tasklet list if we unlock the tasklet (which we
+ * don't do yet).
+ */
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ WARN_ON(1);
+
+again:
+ t->func(t->data);
+
+ /*
+ * Try to unlock the tasklet. We must use cmpxchg, because
+ * another CPU might have scheduled or disabled the tasklet.
+ * We only allow the STATE_RUN -> 0 transition here.
+ */
+ while (!tasklet_tryunlock(t)) {
+ /*
+ * If it got disabled meanwhile, bail out:
+ */
+ if (atomic_read(&t->count))
+ goto out_disabled;
+ /*
+ * If it got scheduled meanwhile, re-execute
+ * the tasklet function:
+ */
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ goto again;
+ if (!--loops) {
+ printk("hm, tasklet state: %08lx\n", t->state);
+ WARN_ON(1);
+ tasklet_unlock(t);
+ break;
+ }
+ }
}
}
+static void tasklet_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __get_cpu_var(tasklet_vec).head;
+ __get_cpu_var(tasklet_vec).head = NULL;
+ __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
+ local_irq_enable();
+
+ __tasklet_action(a, list);
+}
+
static void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -525,30 +1007,7 @@ static void tasklet_hi_action(struct softirq_action *a)
__this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
local_irq_enable();
- while (list) {
- struct tasklet_struct *t = list;
-
- list = list->next;
-
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
- }
-
- local_irq_disable();
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- __raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_enable();
- }
+ __tasklet_action(a, list);
}
void tasklet_init(struct tasklet_struct *t,
@@ -569,7 +1028,7 @@ void tasklet_kill(struct tasklet_struct *t)
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do {
- yield();
+ msleep(1);
} while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
@@ -643,26 +1102,26 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int ksoftirqd_should_run(unsigned int cpu)
-{
- return local_softirq_pending();
-}
-
-static void run_ksoftirqd(unsigned int cpu)
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
+void tasklet_unlock_wait(struct tasklet_struct *t)
{
- local_irq_disable();
- if (local_softirq_pending()) {
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
/*
- * We can safely run softirq on inline stack, as we are not deep
- * in the task stack here.
+ * Hack for now to avoid this busy-loop:
*/
- __do_softirq();
- rcu_note_context_switch(cpu);
- local_irq_enable();
- cond_resched();
- return;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ msleep(1);
+#else
+ barrier();
+#endif
}
- local_irq_enable();
+}
+EXPORT_SYMBOL(tasklet_unlock_wait);
+#endif
+
+static int ksoftirqd_should_run(unsigned int cpu)
+{
+ return ksoftirqd_softirq_pending();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -744,6 +1203,8 @@ static struct notifier_block cpu_nfb = {
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
+ .setup = ksoftirqd_set_sched_params,
+ .cleanup = ksoftirqd_clr_sched_params,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..bcbae9c962a9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -30,12 +30,12 @@ struct cpu_stop_done {
atomic_t nr_todo; /* nr left to execute */
bool executed; /* actually executed? */
int ret; /* collected return value */
- struct completion completion; /* fired if nr_todo reaches 0 */
+ struct task_struct *waiter; /* woken when nr_todo reaches 0 */
};
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
- spinlock_t lock;
+ raw_spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
};
@@ -56,7 +56,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
atomic_set(&done->nr_todo, nr_todo);
- init_completion(&done->completion);
+ done->waiter = current;
}
/* signal completion unless @done is NULL */
@@ -65,8 +65,10 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
if (done) {
if (executed)
done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo))
- complete(&done->completion);
+ if (atomic_dec_and_test(&done->nr_todo)) {
+ wake_up_process(done->waiter);
+ done->waiter = NULL;
+ }
}
}
@@ -78,7 +80,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
unsigned long flags;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
@@ -86,7 +88,23 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
} else
cpu_stop_signal_done(work->done, false);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/*
+ * Sleep in TASK_UNINTERRUPTIBLE until done->nr_todo has dropped to
+ * zero, then busy-wait until done->waiter has been cleared.
+ *
+ * cpu_stop_signal_done() stores NULL to done->waiter after calling
+ * wake_up_process(), and callers such as __stop_cpus() keep *done on
+ * their stack, so this function must not return before that store.
+ */
+static void wait_for_stop_done(struct cpu_stop_done *done)
+{
+ /* Set the state before each check of nr_todo */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (atomic_read(&done->nr_todo)) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ /*
+ * We need to wait until cpu_stop_signal_done() has cleared
+ * done->waiter.
+ */
+ while (done->waiter)
+ cpu_relax();
+ set_current_state(TASK_RUNNING);
}
/**
@@ -120,7 +138,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
cpu_stop_init_done(&done, 1);
cpu_stop_queue_work(cpu, &work);
- wait_for_completion(&done.completion);
+ wait_for_stop_done(&done);
return done.executed ? done.ret : -ENOENT;
}
@@ -248,7 +266,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
- preempt_disable();
+ preempt_disable_nort();
msdata = (struct multi_stop_data){
.fn = fn,
.data = arg,
@@ -281,7 +299,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
* This relies on the stopper workqueues to be FIFO.
*/
if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
- preempt_enable();
+ preempt_enable_nort();
return -ENOENT;
}
@@ -295,9 +313,9 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
&irq_cpu_stop_queue_work,
&call_args, 1);
lg_local_unlock(&stop_cpus_lock);
- preempt_enable();
+ preempt_enable_nort();
- wait_for_completion(&done.completion);
+ wait_for_stop_done(&done);
return done.executed ? done.ret : -ENOENT;
}
@@ -328,7 +346,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
static void queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
- struct cpu_stop_done *done)
+ struct cpu_stop_done *done, bool inactive)
{
struct cpu_stop_work *work;
unsigned int cpu;
@@ -342,11 +360,13 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
}
/*
- * Disable preemption while queueing to avoid getting
- * preempted by a stopper which might wait for other stoppers
- * to enter @fn which can lead to deadlock.
+ * Make sure that all work is queued on all cpus before
+ * any of the cpus can execute it.
*/
- lg_global_lock(&stop_cpus_lock);
+ if (!inactive)
+ lg_global_lock(&stop_cpus_lock);
+ else
+ lg_global_trylock_relax(&stop_cpus_lock);
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
lg_global_unlock(&stop_cpus_lock);
@@ -358,8 +378,8 @@ static int __stop_cpus(const struct cpumask *cpumask,
struct cpu_stop_done done;
cpu_stop_init_done(&done, cpumask_weight(cpumask));
- queue_stop_cpus_work(cpumask, fn, arg, &done);
- wait_for_completion(&done.completion);
+ queue_stop_cpus_work(cpumask, fn, arg, &done, false);
+ wait_for_stop_done(&done);
return done.executed ? done.ret : -ENOENT;
}
@@ -438,9 +458,9 @@ static int cpu_stop_should_run(unsigned int cpu)
unsigned long flags;
int run;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
run = !list_empty(&stopper->works);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
return run;
}
@@ -452,13 +472,13 @@ static void cpu_stopper_thread(unsigned int cpu)
repeat:
work = NULL;
- spin_lock_irq(&stopper->lock);
+ raw_spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
work = list_first_entry(&stopper->works,
struct cpu_stop_work, list);
list_del_init(&work->list);
}
- spin_unlock_irq(&stopper->lock);
+ raw_spin_unlock_irq(&stopper->lock);
if (work) {
cpu_stop_fn_t fn = work->fn;
@@ -466,6 +486,16 @@ repeat:
struct cpu_stop_done *done = work->done;
char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
+ /*
+ * Wait until the stopper finished scheduling on all
+ * cpus
+ */
+ lg_global_lock(&stop_cpus_lock);
+ /*
+ * Let other cpu threads continue as well
+ */
+ lg_global_unlock(&stop_cpus_lock);
+
/* cpu stop callbacks are not allowed to sleep */
preempt_disable();
@@ -480,7 +510,13 @@ repeat:
kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
ksym_buf), arg);
+ /*
+ * Make sure that the wakeup and setting done->waiter
+ * to NULL is atomic.
+ */
+ local_irq_disable();
cpu_stop_signal_done(done, true);
+ local_irq_enable();
goto repeat;
}
}
@@ -499,20 +535,20 @@ static void cpu_stop_park(unsigned int cpu)
unsigned long flags;
/* drain remaining works */
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
list_for_each_entry(work, &stopper->works, list)
cpu_stop_signal_done(work->done, false);
stopper->enabled = false;
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
}
static void cpu_stop_unpark(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- spin_lock_irq(&stopper->lock);
+ raw_spin_lock_irq(&stopper->lock);
stopper->enabled = true;
- spin_unlock_irq(&stopper->lock);
+ raw_spin_unlock_irq(&stopper->lock);
}
static struct smp_hotplug_thread cpu_stop_threads = {
@@ -534,10 +570,12 @@ static int __init cpu_stop_init(void)
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- spin_lock_init(&stopper->lock);
+ raw_spin_lock_init(&stopper->lock);
INIT_LIST_HEAD(&stopper->works);
}
+ lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
+
BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
stop_machine_initialized = true;
return 0;
@@ -633,11 +671,11 @@ int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
set_state(&msdata, MULTI_STOP_PREPARE);
cpu_stop_init_done(&done, num_active_cpus());
queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
- &done);
+ &done, true);
ret = multi_cpu_stop(&msdata);
/* Busy wait for completion. */
- while (!completion_done(&done.completion))
+ while (atomic_read(&done.nr_todo))
cpu_relax();
mutex_unlock(&stop_cpus_mutex);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..23d7203cc8af 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -73,7 +73,8 @@ static struct clocksource clocksource_jiffies = {
.shift = JIFFIES_SHIFT,
};
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
@@ -82,9 +83,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
ret = jiffies_64;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4f3d55..d6132cde9ba3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,6 +10,7 @@
#include <linux/workqueue.h>
#include <linux/hrtimer.h>
#include <linux/jiffies.h>
+#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/timex.h>
#include <linux/time.h>
@@ -517,10 +518,49 @@ static void sync_cmos_clock(struct work_struct *work)
schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * RT can not call schedule_delayed_work from real interrupt context.
+ * Need to make a thread to do the real work.
+ */
+static struct task_struct *cmos_delay_thread;
+static bool do_cmos_delay;
+
+static int run_cmos_delay(void *ignore)
+{
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (do_cmos_delay) {
+ do_cmos_delay = false;
+ schedule_delayed_work(&sync_cmos_work, 0);
+ }
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+void ntp_notify_cmos_timer(void)
+{
+ do_cmos_delay = true;
+ /* Make visible before waking up process */
+ smp_wmb();
+ wake_up_process(cmos_delay_thread);
+}
+
+static __init int create_cmos_delay_thread(void)
+{
+ cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
+ BUG_ON(IS_ERR(cmos_delay_thread)); /* kthread_run() fails with ERR_PTR, not NULL */
+ return 0;
+}
+early_initcall(create_cmos_delay_thread);
+#else
void ntp_notify_cmos_timer(void)
{
schedule_delayed_work(&sync_cmos_work, 0);
}
+#endif /* CONFIG_PREEMPT_RT_FULL */
#else
void ntp_notify_cmos_timer(void) { }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0de9d7f5045c..aaa999a98d32 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -78,13 +78,15 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -146,9 +148,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
ktime_t next;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
next = tick_next_period;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 8329669b51ec..e41ec1856dbd 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,7 +4,8 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;
#define CS_NAME_LEN 32
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..c4a7d7ba8217 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
return;
/* Reevalute with jiffies_lock held */
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, last_jiffies_update);
if (delta.tv64 >= tick_period.tv64) {
@@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return;
}
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update.tv64 == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return period;
}
@@ -221,6 +226,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
.func = nohz_full_kick_work_func,
+ .flags = IRQ_WORK_HARD_IRQ,
};
/*
@@ -540,10 +546,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
last_update = last_jiffies_update;
last_jiffies = jiffies;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
@@ -721,14 +727,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
- static int ratelimit;
-
- if (ratelimit < 10 &&
- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- pr_warn("NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
+ softirq_check_pending_idle();
return false;
}
@@ -1110,6 +1109,7 @@ void tick_setup_sched_timer(void)
* Emulate tick processing via per-CPU hrtimers:
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.irqsafe = 1;
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per cpu) */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b40279ecd71..d0e8114553ab 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1735,8 +1735,10 @@ EXPORT_SYMBOL(hardpps);
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
do_timer(ticks);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
diff --git a/kernel/timer.c b/kernel/timer.c
index 38f0d40fca13..8f1687a4b9c2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -78,6 +78,9 @@ struct tvec_root {
struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ wait_queue_head_t wait_for_running_timer;
+#endif
unsigned long timer_jiffies;
unsigned long next_timer;
unsigned long active_timers;
@@ -720,6 +723,36 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
}
+#ifndef CONFIG_PREEMPT_RT_FULL
+static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
+ struct tvec_base *old,
+ struct tvec_base *new)
+{
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&old->lock);
+ spin_lock(&new->lock);
+ timer_set_base(timer, new);
+ return new;
+}
+#else
+static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
+ struct tvec_base *old,
+ struct tvec_base *new)
+{
+ /*
+ * We cannot do the above because we might be preempted and
+ * then the preempter would see NULL and loop forever.
+ */
+ if (spin_trylock(&new->lock)) {
+ timer_set_base(timer, new);
+ spin_unlock(&old->lock);
+ return new;
+ }
+ return old;
+}
+#endif
+
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
bool pending_only, int pinned)
@@ -739,12 +772,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
+ preempt_disable_rt();
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
cpu = get_nohz_timer_target();
#endif
+ preempt_enable_rt();
+
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
@@ -755,14 +791,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
* handler yet has not finished. This also guarantees that
* the timer is serialized wrt itself.
*/
- if (likely(base->running_timer != timer)) {
- /* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
- spin_unlock(&base->lock);
- base = new_base;
- spin_lock(&base->lock);
- timer_set_base(timer, base);
- }
+ if (likely(base->running_timer != timer))
+ base = switch_timer_base(timer, base, new_base);
}
timer->expires = expires;
@@ -945,6 +975,29 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
EXPORT_SYMBOL_GPL(add_timer_on);
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Wait for a running timer; timer->base may carry flag bits, so strip them
+ */
+static void wait_for_running_timer(struct timer_list *timer)
+{
+ struct tvec_base *base = tbase_get_base(timer->base);
+
+ if (base->running_timer == timer)
+ wait_event(base->wait_for_running_timer,
+ base->running_timer != timer);
+}
+
+# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer)
+#else
+static inline void wait_for_running_timer(struct timer_list *timer)
+{
+ cpu_relax();
+}
+
+# define wakeup_timer_waiters(b) do { } while (0)
+#endif
+
/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
@@ -1002,7 +1055,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
}
EXPORT_SYMBOL(try_to_del_timer_sync);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1062,7 +1115,7 @@ int del_timer_sync(struct timer_list *timer)
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
- cpu_relax();
+ wait_for_running_timer(timer);
}
}
EXPORT_SYMBOL(del_timer_sync);
@@ -1179,15 +1232,17 @@ static inline void __run_timers(struct tvec_base *base)
if (irqsafe) {
spin_unlock(&base->lock);
call_timer_fn(timer, fn, data);
+ base->running_timer = NULL;
spin_lock(&base->lock);
} else {
spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, data);
+ base->running_timer = NULL;
spin_lock_irq(&base->lock);
}
}
}
- base->running_timer = NULL;
+ wakeup_timer_waiters(base);
spin_unlock_irq(&base->lock);
}
@@ -1327,17 +1382,31 @@ unsigned long get_next_timer_interrupt(unsigned long now)
if (cpu_is_offline(smp_processor_id()))
return expires;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /*
+ * On PREEMPT_RT we cannot sleep here. If the trylock does not
+ * succeed then we return the worst-case 'expires in 1 tick'
+ * value. We use the rt functions here directly to avoid a
+ * migrate_disable() call.
+ */
+ if (!spin_do_trylock(&base->lock))
+ return now + 1;
+#else
spin_lock(&base->lock);
+#endif
if (base->active_timers) {
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
expires = base->next_timer;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+ rt_spin_unlock_after_trylock_in_irq(&base->lock);
+#else
spin_unlock(&base->lock);
+#endif
if (time_before_eq(expires, now))
return now;
-
return cmp_next_hrtimer_event(now, expires);
}
#endif
@@ -1353,13 +1422,13 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
+ scheduler_tick();
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK)
if (in_irq())
irq_work_run();
#endif
- scheduler_tick();
run_posix_cpu_timers(p);
}
@@ -1370,7 +1439,9 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __this_cpu_read(tvec_bases);
- hrtimer_run_pending();
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
+ irq_work_run();
+#endif
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -1381,8 +1452,52 @@ static void run_timer_softirq(struct softirq_action *h)
*/
void run_local_timers(void)
{
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
+
hrtimer_run_queues();
+ /*
+ * We can access this lockless as we are in the timer
+ * interrupt. If there are no timers queued, nothing to do in
+ * the timer softirq.
+ */
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+#ifndef CONFIG_SMP
+ /*
+ * The spin_do_trylock() later may fail as the lock may be held before
+ * the interrupt arrived. The spin-lock debugging code will raise a
+ * warning if the try_lock fails on UP. Since this is only an
+ * optimization for the FULL_NO_HZ case (not to run the timer softirq on
+ * a nohz_full CPU) we don't really care and schedule the softirq.
+ */
raise_softirq(TIMER_SOFTIRQ);
+ return;
+#endif
+
+ /* On RT, irq work runs from softirq */
+ if (irq_work_needs_cpu()) {
+ raise_softirq(TIMER_SOFTIRQ);
+ return;
+ }
+
+ if (!spin_do_trylock(&base->lock)) {
+ raise_softirq(TIMER_SOFTIRQ);
+ return;
+ }
+#endif
+
+ if (!base->active_timers)
+ goto out;
+
+ /* Check whether the next pending timer has expired */
+ if (time_before_eq(base->next_timer, jiffies))
+ raise_softirq(TIMER_SOFTIRQ);
+out:
+#ifdef CONFIG_PREEMPT_RT_FULL
+ rt_spin_unlock_after_trylock_in_irq(&base->lock);
+#endif
+ /* The ; ensures that gcc won't complain in the !RT case */
+ ;
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1546,6 +1661,9 @@ static int init_timers_cpu(int cpu)
base = per_cpu(tvec_bases, cpu);
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+ init_waitqueue_head(&base->wait_for_running_timer);
+#endif
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1584,7 +1702,7 @@ static void migrate_timers(int cpu)
BUG_ON(cpu_online(cpu));
old_base = per_cpu(tvec_bases, cpu);
- new_base = get_cpu_var(tvec_bases);
+ new_base = get_local_var(tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1605,7 +1723,7 @@ static void migrate_timers(int cpu)
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
- put_cpu_var(tvec_bases);
+ put_local_var(tvec_bases);
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 015f85aaca08..bbe95b9e0115 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -192,6 +192,24 @@ config IRQSOFF_TRACER
enabled. This option and the preempt-off timing option can be
used together or separately.)
+config INTERRUPT_OFF_HIST
+ bool "Interrupts-off Latency Histogram"
+ depends on IRQSOFF_TRACER
+ help
+ This option generates continuously updated histograms (one per cpu)
+ of the duration of time periods with interrupts disabled. The
+ histograms are disabled by default. To enable them, write a non-zero
+ number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
+
+ If PREEMPT_OFF_HIST is also selected, additional histograms (one
+ per cpu) are generated that accumulate the duration of time periods
+ when both interrupts and preemption are disabled. The histogram data
+ will be located in the debug file system at
+
+ /sys/kernel/debug/tracing/latency_hist/irqsoff
+
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
@@ -216,6 +234,24 @@ config PREEMPT_TRACER
enabled. This option and the irqs-off timing option can be
used together or separately.)
+config PREEMPT_OFF_HIST
+ bool "Preemption-off Latency Histogram"
+ depends on PREEMPT_TRACER
+ help
+ This option generates continuously updated histograms (one per cpu)
+ of the duration of time periods with preemption disabled. The
+ histograms are disabled by default. To enable them, write a non-zero
+ number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
+
+ If INTERRUPT_OFF_HIST is also selected, additional histograms (one
+ per cpu) are generated that accumulate the duration of time periods
+ when both interrupts and preemption are disabled. The histogram data
+ will be located in the debug file system at
+
+ /sys/kernel/debug/tracing/latency_hist/preemptoff
+
config SCHED_TRACER
bool "Scheduling Latency Tracer"
select GENERIC_TRACER
@@ -226,6 +262,74 @@ config SCHED_TRACER
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
+config WAKEUP_LATENCY_HIST
+ bool "Scheduling Latency Histogram"
+ depends on SCHED_TRACER
+ help
+ This option generates continuously updated histograms (one per cpu)
+ of the scheduling latency of the highest priority task.
+ The histograms are disabled by default. To enable them, write a
+ non-zero number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/wakeup
+
+ Two different algorithms are used, one to determine the latency of
+ processes that exclusively use the highest priority of the system and
+ another one to determine the latency of processes that share the
+ highest system priority with other processes. The former is used to
+ improve hardware and system software, the latter to optimize the
+ priority design of a given system. The histogram data will be
+ located in the debug file system at
+
+ /sys/kernel/debug/tracing/latency_hist/wakeup
+
+ and
+
+ /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
+
+ If both Scheduling Latency Histogram and Missed Timer Offsets
+ Histogram are selected, additional histogram data will be collected
+ that contain, in addition to the wakeup latency, the timer latency, in
+ case the wakeup was triggered by an expired timer. These histograms
+ are available in the
+
+ /sys/kernel/debug/tracing/latency_hist/timerandwakeup
+
+ directory. They reflect the apparent interrupt and scheduling latency
+ and are best suited to determine the worst-case latency of a given
+ system. To enable these histograms, write a non-zero number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
+
+config MISSED_TIMER_OFFSETS_HIST
+ depends on HIGH_RES_TIMERS
+ select GENERIC_TRACER
+ bool "Missed Timer Offsets Histogram"
+ help
+ Generate a histogram of missed timer offsets in microseconds. The
+ histograms are disabled by default. To enable them, write a non-zero
+ number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
+
+ The histogram data will be located in the debug file system at
+
+ /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
+
+ If both Scheduling Latency Histogram and Missed Timer Offsets
+ Histogram are selected, additional histogram data will be collected
+ that contain, in addition to the wakeup latency, the timer latency, in
+ case the wakeup was triggered by an expired timer. These histograms
+ are available in the
+
+ /sys/kernel/debug/tracing/latency_hist/timerandwakeup
+
+ directory. They reflect the apparent interrupt and scheduling latency
+ and are best suited to determine the worst-case latency of a given
+ system. To enable these histograms, write a non-zero number to
+
+ /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
+
config ENABLE_DEFAULT_TRACERS
bool "Trace process context switches and events"
depends on !GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..e28db4c38b2f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -34,6 +34,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
+obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
+obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
+obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
new file mode 100644
index 000000000000..66a69eb5329c
--- /dev/null
+++ b/kernel/trace/latency_hist.c
@@ -0,0 +1,1178 @@
+/*
+ * kernel/trace/latency_hist.c
+ *
+ * Add support for histograms of preemption-off latency,
+ * interrupt-off latency and wakeup latency. It depends on
+ * Real-Time Preemption Support.
+ *
+ * Copyright (C) 2005 MontaVista Software, Inc.
+ * Yi Yang <yyang@ch.mvista.com>
+ *
+ * Converted to work with the new latency tracer.
+ * Copyright (C) 2008 Red Hat, Inc.
+ * Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <asm/div64.h>
+
+#include "trace.h"
+#include <trace/events/sched.h>
+
+#define NSECS_PER_USECS 1000L
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/hist.h>
+
+enum {
+ IRQSOFF_LATENCY = 0,
+ PREEMPTOFF_LATENCY,
+ PREEMPTIRQSOFF_LATENCY,
+ WAKEUP_LATENCY,
+ WAKEUP_LATENCY_SHAREDPRIO,
+ MISSED_TIMER_OFFSETS,
+ TIMERANDWAKEUP_LATENCY,
+ MAX_LATENCY_TYPE,
+};
+
+#define MAX_ENTRY_NUM 10240
+
+struct hist_data {
+ atomic_t hist_mode; /* 0 don't log, non-zero log */
+ long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
+ long min_lat; /* smallest sample seen, offset included */
+ long max_lat; /* largest sample seen, offset included */
+ unsigned long long below_hist_bound_samples; /* samples below bucket 0 */
+ unsigned long long above_hist_bound_samples; /* samples >= MAX_ENTRY_NUM */
+ long long accumulate_lat; /* running sum of offset-adjusted samples */
+ unsigned long long total_samples;
+ unsigned long long hist_array[MAX_ENTRY_NUM]; /* one counter per bucket */
+};
+
+struct enable_data {
+ int latency_type;
+ int enabled;
+};
+
+static char *latency_hist_dir_root = "latency_hist";
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
+static char *irqsoff_hist_dir = "irqsoff";
+static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
+static DEFINE_PER_CPU(int, hist_irqsoff_counting);
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
+static char *preemptoff_hist_dir = "preemptoff";
+static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
+static DEFINE_PER_CPU(int, hist_preemptoff_counting);
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
+static char *preemptirqsoff_hist_dir = "preemptirqsoff";
+static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
+static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
+static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
+static struct enable_data preemptirqsoff_enabled_data = {
+ .latency_type = PREEMPTIRQSOFF_LATENCY,
+ .enabled = 0,
+};
+#endif
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+struct maxlatproc_data {
+ char comm[FIELD_SIZEOF(struct task_struct, comm)];
+ char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
+ int pid;
+ int current_pid;
+ int prio;
+ int current_prio;
+ long latency;
+ long timeroffset;
+ cycle_t timestamp;
+};
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
+static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
+static char *wakeup_latency_hist_dir = "wakeup";
+static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
+static notrace void probe_wakeup_latency_hist_start(void *v,
+ struct task_struct *p, int success);
+static notrace void probe_wakeup_latency_hist_stop(void *v,
+ struct task_struct *prev, struct task_struct *next);
+static notrace void probe_sched_migrate_task(void *,
+ struct task_struct *task, int cpu);
+static struct enable_data wakeup_latency_enabled_data = {
+ .latency_type = WAKEUP_LATENCY,
+ .enabled = 0,
+};
+static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
+static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
+static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
+static DEFINE_PER_CPU(int, wakeup_sharedprio);
+static unsigned long wakeup_pid;
+#endif
+
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
+static char *missed_timer_offsets_dir = "missed_timer_offsets";
+static notrace void probe_hrtimer_interrupt(void *v, int cpu,
+ long long offset, struct task_struct *curr, struct task_struct *task);
+static struct enable_data missed_timer_offsets_enabled_data = {
+ .latency_type = MISSED_TIMER_OFFSETS,
+ .enabled = 0,
+};
+static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
+static unsigned long missed_timer_offsets_pid;
+#endif
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
+static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
+static struct enable_data timerandwakeup_enabled_data = {
+ .latency_type = TIMERANDWAKEUP_LATENCY,
+ .enabled = 0,
+};
+static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
+#endif
+
+void notrace latency_hist(int latency_type, int cpu, long latency,
+ long timeroffset, cycle_t stop,
+ struct task_struct *p)
+{
+ struct hist_data *my_hist;
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ struct maxlatproc_data *mp = NULL;
+#endif
+
+ if (!cpu_possible(cpu) || latency_type < 0 ||
+ latency_type >= MAX_LATENCY_TYPE)
+ return;
+
+ switch (latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ case IRQSOFF_LATENCY:
+ my_hist = &per_cpu(irqsoff_hist, cpu);
+ break;
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ case PREEMPTOFF_LATENCY:
+ my_hist = &per_cpu(preemptoff_hist, cpu);
+ break;
+#endif
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+ case PREEMPTIRQSOFF_LATENCY:
+ my_hist = &per_cpu(preemptirqsoff_hist, cpu);
+ break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ case WAKEUP_LATENCY:
+ my_hist = &per_cpu(wakeup_latency_hist, cpu);
+ mp = &per_cpu(wakeup_maxlatproc, cpu);
+ break;
+ case WAKEUP_LATENCY_SHAREDPRIO:
+ my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
+ mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
+ break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ case MISSED_TIMER_OFFSETS:
+ my_hist = &per_cpu(missed_timer_offsets, cpu);
+ mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
+ break;
+#endif
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ case TIMERANDWAKEUP_LATENCY:
+ my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
+ mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
+ break;
+#endif
+
+ default:
+ return;
+ }
+
+ latency += my_hist->offset;
+
+ if (atomic_read(&my_hist->hist_mode) == 0)
+ return;
+
+ if (latency < 0 || latency >= MAX_ENTRY_NUM) {
+ if (latency < 0)
+ my_hist->below_hist_bound_samples++;
+ else
+ my_hist->above_hist_bound_samples++;
+ } else
+ my_hist->hist_array[latency]++;
+
+ if (unlikely(latency > my_hist->max_lat ||
+ my_hist->min_lat == LONG_MAX)) {
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ if (latency_type == WAKEUP_LATENCY ||
+ latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
+ latency_type == MISSED_TIMER_OFFSETS ||
+ latency_type == TIMERANDWAKEUP_LATENCY) {
+ strncpy(mp->comm, p->comm, sizeof(mp->comm));
+ strncpy(mp->current_comm, current->comm,
+ sizeof(mp->current_comm));
+ mp->pid = task_pid_nr(p);
+ mp->current_pid = task_pid_nr(current);
+ mp->prio = p->prio;
+ mp->current_prio = current->prio;
+ mp->latency = latency;
+ mp->timeroffset = timeroffset;
+ mp->timestamp = stop;
+ }
+#endif
+ my_hist->max_lat = latency;
+ }
+ if (unlikely(latency < my_hist->min_lat))
+ my_hist->min_lat = latency;
+ my_hist->total_samples++;
+ my_hist->accumulate_lat += latency;
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t *index_ptr = NULL;
+ loff_t index = *pos;
+ struct hist_data *my_hist = m->private;
+
+ if (index == 0) {
+ char minstr[32], avgstr[32], maxstr[32];
+
+ atomic_dec(&my_hist->hist_mode);
+
+ if (likely(my_hist->total_samples)) {
+ long avg = (long) div64_s64(my_hist->accumulate_lat,
+ my_hist->total_samples);
+ snprintf(minstr, sizeof(minstr), "%ld",
+ my_hist->min_lat - my_hist->offset);
+ snprintf(avgstr, sizeof(avgstr), "%ld",
+ avg - my_hist->offset);
+ snprintf(maxstr, sizeof(maxstr), "%ld",
+ my_hist->max_lat - my_hist->offset);
+ } else {
+ strcpy(minstr, "<undef>");
+ strcpy(avgstr, minstr);
+ strcpy(maxstr, minstr);
+ }
+
+ seq_printf(m, "#Minimum latency: %s microseconds\n"
+ "#Average latency: %s microseconds\n"
+ "#Maximum latency: %s microseconds\n"
+ "#Total samples: %llu\n"
+ "#There are %llu samples lower than %ld"
+ " microseconds.\n"
+ "#There are %llu samples greater or equal"
+ " than %ld microseconds.\n"
+ "#usecs\t%16s\n",
+ minstr, avgstr, maxstr,
+ my_hist->total_samples,
+ my_hist->below_hist_bound_samples,
+ -my_hist->offset,
+ my_hist->above_hist_bound_samples,
+ MAX_ENTRY_NUM - my_hist->offset,
+ "samples");
+ }
+ if (index < MAX_ENTRY_NUM) {
+ index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+ if (index_ptr)
+ *index_ptr = index;
+ }
+
+ return index_ptr;
+}
+
+static void *l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ loff_t *index_ptr = p;
+ struct hist_data *my_hist = m->private;
+
+ if (++*pos >= MAX_ENTRY_NUM) {
+ atomic_inc(&my_hist->hist_mode); /* pairs with l_start()'s dec at index 0 */
+ return NULL;
+ }
+ *index_ptr = *pos; /* advance the cursor in place */
+ return index_ptr;
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+ kfree(p); /* NOTE(review): hist_mode is not rebalanced here; only l_next() at end does */
+}
+
+static int l_show(struct seq_file *m, void *p)
+{
+ int index = *(loff_t *) p;
+ struct hist_data *my_hist = m->private;
+
+ seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
+ my_hist->hist_array[index]);
+ return 0;
+}
+
+static const struct seq_operations latency_hist_seq_op = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show
+};
+
+static int latency_hist_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &latency_hist_seq_op);
+ if (!ret) {
+ struct seq_file *seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+ return ret;
+}
+
+static const struct file_operations latency_hist_fops = {
+ .open = latency_hist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+static void clear_maxlatprocdata(struct maxlatproc_data *mp)
+{
+ mp->comm[0] = mp->current_comm[0] = '\0';
+ mp->prio = mp->current_prio = mp->pid = mp->current_pid =
+ mp->latency = mp->timeroffset = -1;
+ mp->timestamp = 0;
+}
+#endif
+
+/*
+ * Clear all counters of @hist and restore the min/max sentinels
+ * (LONG_MAX / LONG_MIN).
+ *
+ * hist_mode is decremented before the clearing and incremented again
+ * afterwards.
+ * NOTE(review): presumably the recording path skips updates while
+ * hist_mode is 0 - confirm against latency_hist().
+ */
+static void hist_reset(struct hist_data *hist)
+{
+ atomic_dec(&hist->hist_mode);
+
+ memset(hist->hist_array, 0, sizeof(hist->hist_array));
+ hist->below_hist_bound_samples = 0ULL;
+ hist->above_hist_bound_samples = 0ULL;
+ hist->min_lat = LONG_MAX;
+ hist->max_lat = LONG_MIN;
+ hist->total_samples = 0ULL;
+ hist->accumulate_lat = 0LL;
+
+ atomic_inc(&hist->hist_mode);
+}
+
+/*
+ * write() handler for the "reset" files.
+ *
+ * The latency type is carried in file->private_data. For every online CPU
+ * the matching per-CPU histogram is cleared with hist_reset(). For the
+ * wakeup, missed-timer and timer-and-wakeup types the associated
+ * maxlatproc record is cleared as well. The written data itself is ignored.
+ *
+ * Returns @size on success, or -EINVAL if the latency type does not match
+ * any histogram compiled into this kernel.
+ */
+static ssize_t
+latency_hist_reset(struct file *file, const char __user *a,
+ size_t size, loff_t *off)
+{
+ int cpu;
+ struct hist_data *hist = NULL;
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ struct maxlatproc_data *mp = NULL;
+#endif
+ off_t latency_type = (off_t) file->private_data;
+
+ for_each_online_cpu(cpu) {
+
+ switch (latency_type) {
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ case PREEMPTOFF_LATENCY:
+ hist = &per_cpu(preemptoff_hist, cpu);
+ break;
+#endif
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ case IRQSOFF_LATENCY:
+ hist = &per_cpu(irqsoff_hist, cpu);
+ break;
+#endif
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+ case PREEMPTIRQSOFF_LATENCY:
+ hist = &per_cpu(preemptirqsoff_hist, cpu);
+ break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ case WAKEUP_LATENCY:
+ hist = &per_cpu(wakeup_latency_hist, cpu);
+ mp = &per_cpu(wakeup_maxlatproc, cpu);
+ break;
+ case WAKEUP_LATENCY_SHAREDPRIO:
+ hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
+ mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
+ break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ case MISSED_TIMER_OFFSETS:
+ hist = &per_cpu(missed_timer_offsets, cpu);
+ mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
+ break;
+#endif
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ case TIMERANDWAKEUP_LATENCY:
+ hist = &per_cpu(timerandwakeup_latency_hist, cpu);
+ mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
+ break;
+#endif
+ default:
+ /* Unknown type: hist would stay NULL and be dereferenced. */
+ return -EINVAL;
+ }
+
+ hist_reset(hist);
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ if (latency_type == WAKEUP_LATENCY ||
+ latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
+ latency_type == MISSED_TIMER_OFFSETS ||
+ latency_type == TIMERANDWAKEUP_LATENCY)
+ clear_maxlatprocdata(mp);
+#endif
+ }
+
+ return size;
+}
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+/*
+ * read() handler for the "pid" files: format the unsigned long that
+ * file->private_data points to as a decimal line and copy it to user space.
+ */
+static ssize_t
+show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+ unsigned long *this_pid = file->private_data;
+
+ r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/*
+ * write() handler for the "pid" files: parse a base-10 unsigned long from
+ * user space and store it in the variable file->private_data points to.
+ *
+ * Returns @cnt on success, -EINVAL if the input does not fit into the
+ * local buffer or does not parse, -EFAULT if the user copy fails.
+ */
+static ssize_t do_pid(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ unsigned long pid;
+ unsigned long *this_pid = file->private_data;
+
+ /* Leave room for the terminating NUL written below. */
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = '\0';
+
+ if (kstrtoul(buf, 10, &pid))
+ return -EINVAL;
+
+ *this_pid = pid;
+
+ return cnt;
+}
+#endif
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+/*
+ * read() handler for the max_latency-CPU%d files.
+ *
+ * Prints the maxlatproc record file->private_data points to: pid, priority,
+ * latency, timer offset and comm of the recorded task, then pid, priority
+ * and comm of the "current" task, then the timestamp as seconds.microseconds.
+ * Prints "(none)" while either pid field is still -1.
+ *
+ * Returns the number of bytes copied to user space or -ENOMEM.
+ */
+static ssize_t
+show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ int r;
+ struct maxlatproc_data *mp = file->private_data;
+ int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
+ unsigned long long t;
+ unsigned long usecs, secs;
+ char *buf;
+
+ if (mp->pid == -1 || mp->current_pid == -1) {
+ buf = "(none)\n";
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf,
+ strlen(buf));
+ }
+
+ buf = kmalloc(strmaxlen, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ t = ns2usecs(mp->timestamp);
+ usecs = do_div(t, USEC_PER_SEC);
+ secs = (unsigned long) t;
+ /*
+ * scnprintf() returns the number of characters actually stored, so a
+ * truncated record can never make simple_read_from_buffer() read past
+ * the end of buf (snprintf() would return the untruncated length).
+ */
+ r = scnprintf(buf, strmaxlen,
+ "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
+ MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
+ mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
+ secs, usecs);
+ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ kfree(buf);
+ return r;
+}
+#endif
+
+/*
+ * read() handler for the files in the "enable" directory: print the
+ * enabled field of the enable_data that file->private_data points to.
+ */
+static ssize_t
+show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ struct enable_data *ed = file->private_data;
+ int r;
+
+ r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/*
+ * write() handler for the files in the "enable" directory.
+ *
+ * Parses a base-10 number from user space. A nonzero value registers the
+ * tracepoint probes for the latency type stored in the enable_data, zero
+ * unregisters them and clears the related per-CPU state. A write that does
+ * not change the current enabled/disabled state is a no-op.
+ *
+ * If a probe fails to register, the probes already registered for that
+ * type are unregistered again and the error is returned. Enabling
+ * TIMERANDWAKEUP_LATENCY requires both the wakeup and the missed-timer
+ * histograms to be enabled already, otherwise -EINVAL is returned.
+ *
+ * Returns @cnt on success, -EINVAL/-EFAULT on bad input.
+ */
+static ssize_t
+do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ /* kstrtoul() stores through an unsigned long pointer. */
+ unsigned long enable;
+ struct enable_data *ed = file->private_data;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (kstrtoul(buf, 10, &enable))
+ return -EINVAL;
+
+ if ((enable && ed->enabled) || (!enable && !ed->enabled))
+ return cnt;
+
+ if (enable) {
+ int ret;
+
+ switch (ed->latency_type) {
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+ case PREEMPTIRQSOFF_LATENCY:
+ ret = register_trace_preemptirqsoff_hist(
+ probe_preemptirqsoff_hist, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_preemptirqsoff_hist "
+ "to trace_preemptirqsoff_hist\n");
+ return ret;
+ }
+ break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ case WAKEUP_LATENCY:
+ ret = register_trace_sched_wakeup(
+ probe_wakeup_latency_hist_start, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_wakeup_latency_hist_start "
+ "to trace_sched_wakeup\n");
+ return ret;
+ }
+ ret = register_trace_sched_wakeup_new(
+ probe_wakeup_latency_hist_start, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_wakeup_latency_hist_start "
+ "to trace_sched_wakeup_new\n");
+ unregister_trace_sched_wakeup(
+ probe_wakeup_latency_hist_start, NULL);
+ return ret;
+ }
+ ret = register_trace_sched_switch(
+ probe_wakeup_latency_hist_stop, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_wakeup_latency_hist_stop "
+ "to trace_sched_switch\n");
+ unregister_trace_sched_wakeup(
+ probe_wakeup_latency_hist_start, NULL);
+ unregister_trace_sched_wakeup_new(
+ probe_wakeup_latency_hist_start, NULL);
+ return ret;
+ }
+ ret = register_trace_sched_migrate_task(
+ probe_sched_migrate_task, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_sched_migrate_task "
+ "to trace_sched_migrate_task\n");
+ unregister_trace_sched_wakeup(
+ probe_wakeup_latency_hist_start, NULL);
+ unregister_trace_sched_wakeup_new(
+ probe_wakeup_latency_hist_start, NULL);
+ unregister_trace_sched_switch(
+ probe_wakeup_latency_hist_stop, NULL);
+ return ret;
+ }
+ break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ case MISSED_TIMER_OFFSETS:
+ ret = register_trace_hrtimer_interrupt(
+ probe_hrtimer_interrupt, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't assign "
+ "probe_hrtimer_interrupt "
+ "to trace_hrtimer_interrupt\n");
+ return ret;
+ }
+ break;
+#endif
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ case TIMERANDWAKEUP_LATENCY:
+ if (!wakeup_latency_enabled_data.enabled ||
+ !missed_timer_offsets_enabled_data.enabled)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ break;
+ }
+ } else {
+ switch (ed->latency_type) {
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+ case PREEMPTIRQSOFF_LATENCY:
+ {
+ int cpu;
+
+ unregister_trace_preemptirqsoff_hist(
+ probe_preemptirqsoff_hist, NULL);
+ for_each_online_cpu(cpu) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ per_cpu(hist_irqsoff_counting,
+ cpu) = 0;
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ per_cpu(hist_preemptoff_counting,
+ cpu) = 0;
+#endif
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+ per_cpu(hist_preemptirqsoff_counting,
+ cpu) = 0;
+#endif
+ }
+ }
+ break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ case WAKEUP_LATENCY:
+ {
+ int cpu;
+
+ unregister_trace_sched_wakeup(
+ probe_wakeup_latency_hist_start, NULL);
+ unregister_trace_sched_wakeup_new(
+ probe_wakeup_latency_hist_start, NULL);
+ unregister_trace_sched_switch(
+ probe_wakeup_latency_hist_stop, NULL);
+ unregister_trace_sched_migrate_task(
+ probe_sched_migrate_task, NULL);
+
+ for_each_online_cpu(cpu) {
+ per_cpu(wakeup_task, cpu) = NULL;
+ per_cpu(wakeup_sharedprio, cpu) = 0;
+ }
+ }
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ timerandwakeup_enabled_data.enabled = 0;
+#endif
+ break;
+#endif
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ case MISSED_TIMER_OFFSETS:
+ unregister_trace_hrtimer_interrupt(
+ probe_hrtimer_interrupt, NULL);
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ timerandwakeup_enabled_data.enabled = 0;
+#endif
+ break;
+#endif
+ default:
+ break;
+ }
+ }
+ ed->enabled = enable;
+ return cnt;
+}
+
+static const struct file_operations latency_hist_reset_fops = {
+ .open = tracing_open_generic,
+ .write = latency_hist_reset,
+};
+
+static const struct file_operations enable_fops = {
+ .open = tracing_open_generic,
+ .read = show_enable,
+ .write = do_enable,
+};
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+static const struct file_operations pid_fops = {
+ .open = tracing_open_generic,
+ .read = show_pid,
+ .write = do_pid,
+};
+
+static const struct file_operations maxlatproc_fops = {
+ .open = tracing_open_generic,
+ .read = show_maxlatproc,
+};
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+/*
+ * Probe attached to the preemptirqsoff_hist tracepoint.
+ *
+ * @reason: IRQS_OFF/IRQS_ON, PREEMPT_OFF/PREEMPT_ON or TRACE_START/TRACE_STOP
+ * @starthist: nonzero when a section begins, zero when it ends
+ *
+ * On start, the current timestamp is stored per CPU for every configured
+ * section type (irqs-off, preempt-off, both) that is not already being
+ * counted. On stop, the elapsed time of each section being counted is
+ * converted to microseconds and handed to latency_hist(), and the
+ * "counting" flag of that section is cleared.
+ *
+ * time_set tracks whether ftrace_now() was already read in this call so
+ * that all sections handled in one call share the same timestamp.
+ */
+static notrace void probe_preemptirqsoff_hist(void *v, int reason,
+ int starthist)
+{
+ int cpu = raw_smp_processor_id();
+ int time_set = 0;
+
+ if (starthist) {
+ cycle_t uninitialized_var(start);
+
+ /* Neither preemption nor interrupts are off: nothing to time. */
+ if (!preempt_count() && !irqs_disabled())
+ return;
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ if ((reason == IRQS_OFF || reason == TRACE_START) &&
+ !per_cpu(hist_irqsoff_counting, cpu)) {
+ per_cpu(hist_irqsoff_counting, cpu) = 1;
+ start = ftrace_now(cpu);
+ time_set++;
+ per_cpu(hist_irqsoff_start, cpu) = start;
+ }
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
+ !per_cpu(hist_preemptoff_counting, cpu)) {
+ per_cpu(hist_preemptoff_counting, cpu) = 1;
+ if (!(time_set++))
+ start = ftrace_now(cpu);
+ per_cpu(hist_preemptoff_start, cpu) = start;
+ }
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+ /* The combined section starts once both single sections are counting. */
+ if (per_cpu(hist_irqsoff_counting, cpu) &&
+ per_cpu(hist_preemptoff_counting, cpu) &&
+ !per_cpu(hist_preemptirqsoff_counting, cpu)) {
+ per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
+ if (!time_set)
+ start = ftrace_now(cpu);
+ per_cpu(hist_preemptirqsoff_start, cpu) = start;
+ }
+#endif
+ } else {
+ cycle_t uninitialized_var(stop);
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ if ((reason == IRQS_ON || reason == TRACE_STOP) &&
+ per_cpu(hist_irqsoff_counting, cpu)) {
+ cycle_t start = per_cpu(hist_irqsoff_start, cpu);
+
+ stop = ftrace_now(cpu);
+ time_set++;
+ if (start) {
+ long latency = ((long) (stop - start)) /
+ NSECS_PER_USECS;
+
+ latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
+ stop, NULL);
+ }
+ per_cpu(hist_irqsoff_counting, cpu) = 0;
+ }
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
+ per_cpu(hist_preemptoff_counting, cpu)) {
+ cycle_t start = per_cpu(hist_preemptoff_start, cpu);
+
+ if (!(time_set++))
+ stop = ftrace_now(cpu);
+ if (start) {
+ long latency = ((long) (stop - start)) /
+ NSECS_PER_USECS;
+
+ latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
+ 0, stop, NULL);
+ }
+ per_cpu(hist_preemptoff_counting, cpu) = 0;
+ }
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+ /* The combined section ends as soon as either single section ended. */
+ if ((!per_cpu(hist_irqsoff_counting, cpu) ||
+ !per_cpu(hist_preemptoff_counting, cpu)) &&
+ per_cpu(hist_preemptirqsoff_counting, cpu)) {
+ cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
+
+ if (!time_set)
+ stop = ftrace_now(cpu);
+ if (start) {
+ long latency = ((long) (stop - start)) /
+ NSECS_PER_USECS;
+
+ latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
+ latency, 0, stop, NULL);
+ }
+ per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
+ }
+#endif
+ }
+}
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_RAW_SPINLOCK(wakeup_lock);
+/*
+ * Probe attached to the sched_migrate_task tracepoint.
+ *
+ * If @task is the task recorded in the wakeup_task slot of the CPU it
+ * currently belongs to, move that record (under wakeup_lock) to the slot
+ * of the destination @cpu.
+ *
+ * NOTE(review): a task already stored in the destination CPU's slot is
+ * overwritten without a put_task_struct() - confirm whether that
+ * reference can leak.
+ */
+static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
+ int cpu)
+{
+ int old_cpu = task_cpu(task);
+
+ if (cpu != old_cpu) {
+ unsigned long flags;
+ struct task_struct *cpu_wakeup_task;
+
+ raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+ cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
+ if (task == cpu_wakeup_task) {
+ put_task_struct(cpu_wakeup_task);
+ per_cpu(wakeup_task, old_cpu) = NULL;
+ cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
+ get_task_struct(cpu_wakeup_task);
+ }
+
+ raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+ }
+}
+
+/*
+ * Probe attached to the sched_wakeup and sched_wakeup_new tracepoints.
+ *
+ * Decides whether the woken task @p becomes the task whose wakeup latency
+ * is measured on its CPU:
+ * - if wakeup_pid is set, only the task with that pid is recorded;
+ * - otherwise @p must be an RT task whose prio value is not greater than
+ * that of the currently recorded task and of the current task.
+ * When @p shares its prio value with the recorded or the current task,
+ * the per-CPU wakeup_sharedprio flag is set.
+ *
+ * A recorded task is stored in the per-CPU wakeup_task slot with a task
+ * reference held, and its preempt_timestamp_hist is set to the current
+ * time. @success is not used.
+ */
+static notrace void probe_wakeup_latency_hist_start(void *v,
+ struct task_struct *p, int success)
+{
+ unsigned long flags;
+ struct task_struct *curr = current;
+ int cpu = task_cpu(p);
+ struct task_struct *cpu_wakeup_task;
+
+ raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+ cpu_wakeup_task = per_cpu(wakeup_task, cpu);
+
+ if (wakeup_pid) {
+ if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
+ p->prio == curr->prio)
+ per_cpu(wakeup_sharedprio, cpu) = 1;
+ if (likely(wakeup_pid != task_pid_nr(p)))
+ goto out;
+ } else {
+ if (likely(!rt_task(p)) ||
+ (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
+ p->prio > curr->prio)
+ goto out;
+ if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
+ p->prio == curr->prio)
+ per_cpu(wakeup_sharedprio, cpu) = 1;
+ }
+
+ /* Replace the previously recorded task, dropping its reference. */
+ if (cpu_wakeup_task)
+ put_task_struct(cpu_wakeup_task);
+ cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
+ get_task_struct(cpu_wakeup_task);
+ cpu_wakeup_task->preempt_timestamp_hist =
+ ftrace_now(raw_smp_processor_id());
+out:
+ raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+/*
+ * Probe attached to the sched_switch tracepoint.
+ *
+ * When @next is the task recorded in the wakeup_task slot of its CPU, the
+ * time since its preempt_timestamp_hist is converted to microseconds and
+ * stored in the shared-prio or the regular wakeup histogram, depending on
+ * the per-CPU wakeup_sharedprio flag. In the regular case, and when the
+ * timerandwakeup histogram is enabled, the sum of the task's timer_offset
+ * and the latency is stored in that histogram as well.
+ *
+ * The record is dropped without a sample if the recorded task is already
+ * current, or if @next is a different task with a lower prio value. A
+ * different @next with an equal or higher prio value leaves the record in
+ * place (setting the shared-prio flag on equality).
+ */
+static notrace void probe_wakeup_latency_hist_stop(void *v,
+ struct task_struct *prev, struct task_struct *next)
+{
+ unsigned long flags;
+ int cpu = task_cpu(next);
+ long latency;
+ cycle_t stop;
+ struct task_struct *cpu_wakeup_task;
+
+ raw_spin_lock_irqsave(&wakeup_lock, flags);
+
+ cpu_wakeup_task = per_cpu(wakeup_task, cpu);
+
+ if (cpu_wakeup_task == NULL)
+ goto out;
+
+ /* Already running? */
+ if (unlikely(current == cpu_wakeup_task))
+ goto out_reset;
+
+ if (next != cpu_wakeup_task) {
+ if (next->prio < cpu_wakeup_task->prio)
+ goto out_reset;
+
+ if (next->prio == cpu_wakeup_task->prio)
+ per_cpu(wakeup_sharedprio, cpu) = 1;
+
+ goto out;
+ }
+
+ if (current->prio == cpu_wakeup_task->prio)
+ per_cpu(wakeup_sharedprio, cpu) = 1;
+
+ /*
+ * The task we are waiting for is about to be switched to.
+ * Calculate latency and store it in histogram.
+ */
+ stop = ftrace_now(raw_smp_processor_id());
+
+ latency = ((long) (stop - next->preempt_timestamp_hist)) /
+ NSECS_PER_USECS;
+
+ if (per_cpu(wakeup_sharedprio, cpu)) {
+ latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
+ next);
+ per_cpu(wakeup_sharedprio, cpu) = 0;
+ } else {
+ latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ if (timerandwakeup_enabled_data.enabled) {
+ latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
+ next->timer_offset + latency, next->timer_offset,
+ stop, next);
+ }
+#endif
+ }
+
+out_reset:
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ next->timer_offset = 0;
+#endif
+ /* Drop the reference taken when the task was recorded. */
+ put_task_struct(cpu_wakeup_task);
+ per_cpu(wakeup_task, cpu) = NULL;
+out:
+ raw_spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+/*
+ * Probe attached to the hrtimer_interrupt tracepoint.
+ *
+ * A sample is recorded only when @latency_ns is not positive, @task is a
+ * non-NULL RT task, and @task either has a lower prio value than @curr or
+ * has the same prio value and is not allowed to run on @cpu. If
+ * missed_timer_offsets_pid is set, only the task with that pid is recorded.
+ *
+ * The negated @latency_ns, converted to microseconds, is stored in the
+ * missed_timer_offsets histogram and, when the wakeup histogram is
+ * configured, in task->timer_offset.
+ * NOTE(review): presumably a non-positive @latency_ns means the timer
+ * expired late - confirm against the tracepoint's caller.
+ */
+static notrace void probe_hrtimer_interrupt(void *v, int cpu,
+ long long latency_ns, struct task_struct *curr,
+ struct task_struct *task)
+{
+ if (latency_ns <= 0 && task != NULL && rt_task(task) &&
+ (task->prio < curr->prio ||
+ (task->prio == curr->prio &&
+ !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
+ long latency;
+ cycle_t now;
+
+ if (missed_timer_offsets_pid) {
+ if (likely(missed_timer_offsets_pid !=
+ task_pid_nr(task)))
+ return;
+ }
+
+ now = ftrace_now(cpu);
+ latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
+ latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
+ task);
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ task->timer_offset = latency;
+#endif
+ }
+}
+#endif
+
+/*
+ * Create the debugfs hierarchy of the latency histograms.
+ *
+ * Below the tracing directory this creates the root directory
+ * (latency_hist_dir_root) with an "enable" subdirectory, and for every
+ * configured histogram type a directory holding one "CPU%d" file per
+ * possible CPU plus a "reset" file. The wakeup and missed-timer types
+ * additionally get "max_latency-CPU%d" files and a "pid" file. Each type
+ * that can be switched on gets a control file in "enable".
+ *
+ * Every per-CPU histogram is initialized with hist_mode = 1 and
+ * min_lat = LONG_MAX; every maxlatproc record is cleared.
+ *
+ * NOTE(review): the return values of the debugfs_create_*() calls are
+ * stored but never checked. Always returns 0.
+ */
+static __init int latency_hist_init(void)
+{
+ struct dentry *latency_hist_root = NULL;
+ struct dentry *dentry;
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ struct dentry *dentry_sharedprio;
+#endif
+ struct dentry *entry;
+ struct dentry *enable_root;
+ int i = 0;
+ struct hist_data *my_hist;
+ char name[64];
+ char *cpufmt = "CPU%d";
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ char *cpufmt_maxlatproc = "max_latency-CPU%d";
+ struct maxlatproc_data *mp = NULL;
+#endif
+
+ dentry = tracing_init_dentry();
+ latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
+ enable_root = debugfs_create_dir("enable", latency_hist_root);
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+ dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(irqsoff_hist, i), &latency_hist_fops);
+ my_hist = &per_cpu(irqsoff_hist, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+ }
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+ dentry = debugfs_create_dir(preemptoff_hist_dir,
+ latency_hist_root);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(preemptoff_hist, i), &latency_hist_fops);
+ my_hist = &per_cpu(preemptoff_hist, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+ }
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+ dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
+ latency_hist_root);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
+ my_hist = &per_cpu(preemptirqsoff_hist, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+ }
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+ entry = debugfs_create_file("preemptirqsoff", 0644,
+ enable_root, (void *)&preemptirqsoff_enabled_data,
+ &enable_fops);
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+ dentry = debugfs_create_dir(wakeup_latency_hist_dir,
+ latency_hist_root);
+ dentry_sharedprio = debugfs_create_dir(
+ wakeup_latency_hist_dir_sharedprio, dentry);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(wakeup_latency_hist, i),
+ &latency_hist_fops);
+ my_hist = &per_cpu(wakeup_latency_hist, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+
+ entry = debugfs_create_file(name, 0444, dentry_sharedprio,
+ &per_cpu(wakeup_latency_hist_sharedprio, i),
+ &latency_hist_fops);
+ my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+
+ sprintf(name, cpufmt_maxlatproc, i);
+
+ mp = &per_cpu(wakeup_maxlatproc, i);
+ entry = debugfs_create_file(name, 0444, dentry, mp,
+ &maxlatproc_fops);
+ clear_maxlatprocdata(mp);
+
+ mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
+ entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
+ &maxlatproc_fops);
+ clear_maxlatprocdata(mp);
+ }
+ entry = debugfs_create_file("pid", 0644, dentry,
+ (void *)&wakeup_pid, &pid_fops);
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
+ entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
+ (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
+ entry = debugfs_create_file("wakeup", 0644,
+ enable_root, (void *)&wakeup_latency_enabled_data,
+ &enable_fops);
+#endif
+
+#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
+ dentry = debugfs_create_dir(missed_timer_offsets_dir,
+ latency_hist_root);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
+ my_hist = &per_cpu(missed_timer_offsets, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+
+ sprintf(name, cpufmt_maxlatproc, i);
+ mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
+ entry = debugfs_create_file(name, 0444, dentry, mp,
+ &maxlatproc_fops);
+ clear_maxlatprocdata(mp);
+ }
+ entry = debugfs_create_file("pid", 0644, dentry,
+ (void *)&missed_timer_offsets_pid, &pid_fops);
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
+ entry = debugfs_create_file("missed_timer_offsets", 0644,
+ enable_root, (void *)&missed_timer_offsets_enabled_data,
+ &enable_fops);
+#endif
+
+#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
+ defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
+ dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
+ latency_hist_root);
+ for_each_possible_cpu(i) {
+ sprintf(name, cpufmt, i);
+ entry = debugfs_create_file(name, 0444, dentry,
+ &per_cpu(timerandwakeup_latency_hist, i),
+ &latency_hist_fops);
+ my_hist = &per_cpu(timerandwakeup_latency_hist, i);
+ atomic_set(&my_hist->hist_mode, 1);
+ my_hist->min_lat = LONG_MAX;
+
+ sprintf(name, cpufmt_maxlatproc, i);
+ mp = &per_cpu(timerandwakeup_maxlatproc, i);
+ entry = debugfs_create_file(name, 0444, dentry, mp,
+ &maxlatproc_fops);
+ clear_maxlatprocdata(mp);
+ }
+ entry = debugfs_create_file("reset", 0644, dentry,
+ (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
+ entry = debugfs_create_file("timerandwakeup", 0644,
+ enable_root, (void *)&timerandwakeup_enabled_data,
+ &enable_fops);
+#endif
+ return 0;
+}
+
+device_initcall(latency_hist_init);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 774a0807fe81..f87ef7362493 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1695,28 +1695,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
* We can't schedule on offline CPUs, but it's not necessary
* since we can change their buffer sizes without any race.
*/
+ migrate_disable();
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
if (!cpu_buffer->nr_pages_to_update)
continue;
/* The update must run on the CPU that is being updated. */
- preempt_disable();
if (cpu == smp_processor_id() || !cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- preempt_disable();
}
- preempt_enable();
}
+ migrate_enable();
/* wait for all the updates to complete */
for_each_buffer_cpu(buffer, cpu) {
@@ -1753,22 +1747,16 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- preempt_disable();
+ migrate_disable();
/* The update must run on the CPU that is being updated. */
if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- preempt_disable();
}
- preempt_enable();
+ migrate_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 71136720ffa1..9dcd3c0f174b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -468,7 +468,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
irq_flags, pc);
if (!event)
return 0;
@@ -1554,6 +1554,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
struct task_struct *tsk = current;
entry->preempt_count = pc & 0xff;
+ entry->preempt_lazy_count = preempt_lazy_count();
entry->pid = (tsk) ? tsk->pid : 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
@@ -1563,8 +1564,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+
+ entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
@@ -2470,14 +2474,17 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n");
+ seq_puts(m, "# / _-------=> irqs-off \n");
+ seq_puts(m, "# | / _------=> need-resched \n");
+ seq_puts(m, "# || / _-----=> need-resched_lazy \n");
+ seq_puts(m, "# ||| / _----=> hardirq/softirq \n");
+ seq_puts(m, "# |||| / _---=> preempt-depth \n");
+ seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n");
+ seq_puts(m, "# |||||| / _-=> migrate-disable \n");
+ seq_puts(m, "# ||||||| / delay \n");
+ seq_puts(m, "# cmd pid |||||||| time | caller \n");
+ seq_puts(m, "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2501,13 +2508,16 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n");
- seq_puts(m, "# / _----=> need-resched\n");
- seq_puts(m, "# | / _---=> hardirq/softirq\n");
- seq_puts(m, "# || / _--=> preempt-depth\n");
- seq_puts(m, "# ||| / delay\n");
- seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | |||| | |\n");
+ seq_puts(m, "# _-------=> irqs-off \n");
+ seq_puts(m, "# / _------=> need-resched \n");
+ seq_puts(m, "# |/ _-----=> need-resched_lazy \n");
+ seq_puts(m, "# ||/ _----=> hardirq/softirq \n");
+ seq_puts(m, "# |||/ _---=> preempt-depth \n");
+ seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n");
+ seq_puts(m, "# ||||| / _-=> migrate-disable \n");
+ seq_puts(m, "# |||||| / delay\n");
+ seq_puts(m, "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | |||||| | |\n");
}
void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c8bd809cbd1c..093bb57d586b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -118,6 +118,7 @@ struct kretprobe_trace_entry_head {
* NEED_RESCHED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
+ * NEED_RESCHED_LAZY - lazy reschedule is requested
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -126,6 +127,7 @@ enum trace_flag_type {
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x40,
};
#define TRACE_BUF_SIZE 1024
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e4c4efc4ba0d..203c6ce7ce79 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -160,6 +160,8 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
+ __common_field(unsigned short, migrate_disable);
+ __common_field(unsigned short, padding);
return ret;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2aefbee93a6d..2f4eb37815d8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include "trace.h"
+#include <trace/events/hist.h>
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -439,11 +440,13 @@ void start_critical_timings(void)
{
if (preempt_trace() || irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+ trace_preemptirqsoff_hist(TRACE_START, 1);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
void stop_critical_timings(void)
{
+ trace_preemptirqsoff_hist(TRACE_STOP, 0);
if (preempt_trace() || irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
@@ -453,6 +456,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
#ifdef CONFIG_PROVE_LOCKING
void time_hardirqs_on(unsigned long a0, unsigned long a1)
{
+ trace_preemptirqsoff_hist(IRQS_ON, 0);
if (!preempt_trace() && irq_trace())
stop_critical_timing(a0, a1);
}
@@ -461,6 +465,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(a0, a1);
+ trace_preemptirqsoff_hist(IRQS_OFF, 1);
}
#else /* !CONFIG_PROVE_LOCKING */
@@ -486,6 +491,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
*/
void trace_hardirqs_on(void)
{
+ trace_preemptirqsoff_hist(IRQS_ON, 0);
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
@@ -495,11 +501,13 @@ void trace_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+ trace_preemptirqsoff_hist(IRQS_OFF, 1);
}
EXPORT_SYMBOL(trace_hardirqs_off);
void trace_hardirqs_on_caller(unsigned long caller_addr)
{
+ trace_preemptirqsoff_hist(IRQS_ON, 0);
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
@@ -509,6 +517,7 @@ void trace_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
+ trace_preemptirqsoff_hist(IRQS_OFF, 1);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -518,12 +527,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
#ifdef CONFIG_PREEMPT_TRACER
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
+ trace_preemptirqsoff_hist(PREEMPT_ON, 0);
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
+ trace_preemptirqsoff_hist(PREEMPT_OFF, 1); /* start of a preempt-off section */
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284fbe32..e97e5cf0ebcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -606,6 +606,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
char hardsoft_irq;
char need_resched;
+ char need_resched_lazy;
char irqs_off;
int hardirq;
int softirq;
@@ -634,6 +635,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
need_resched = '.';
break;
}
+ need_resched_lazy =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
hardsoft_irq =
(hardirq && softirq) ? 'H' :
@@ -641,8 +644,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.';
- if (!trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq))
+ if (!trace_seq_printf(s, "%c%c%c%c",
+ irqs_off, need_resched, need_resched_lazy,
+ hardsoft_irq))
return 0;
if (entry->preempt_count)
@@ -650,6 +654,16 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
else
ret = trace_seq_putc(s, '.');
+ if (entry->preempt_lazy_count)
+ ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+ else
+ ret = trace_seq_putc(s, '.');
+
+ if (entry->migrate_disable)
+ ret = trace_seq_printf(s, "%x", entry->migrate_disable);
+ else
+ ret = trace_seq_putc(s, '.');
+
return ret;
}
diff --git a/kernel/user.c b/kernel/user.c
index c2bbb50f5a90..8e364b03c329 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -159,11 +159,11 @@ void free_uid(struct user_struct *up)
if (!up)
return;
- local_irq_save(flags);
+ local_irq_save_nort(flags);
if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
free_user(up, flags);
else
- local_irq_restore(flags);
+ local_irq_restore_nort(flags);
}
struct user_struct *alloc_uid(kuid_t uid)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c9b6f01bf853..f0bddfecc5de 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -205,6 +205,8 @@ static int is_softlockup(unsigned long touch_ts)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -239,10 +241,19 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
+ /*
+ * If early-printk is enabled then kill console logging
+ * first, so that we do not lock up in printk():
+ */
+ printk_kill();
+
+ if (hardlockup_panic) {
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
+ } else {
+ raw_spin_lock(&watchdog_output_lock);
WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ raw_spin_unlock(&watchdog_output_lock);
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -346,6 +357,7 @@ static void watchdog_enable(unsigned int cpu)
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
+ hrtimer->irqsafe = 1;
/* Enable the perf event */
watchdog_nmi_enable(cpu);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b4defdecec8a..c1c3b7e299a0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,8 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/locallock.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
@@ -124,16 +126,21 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
+ * On RT we need the extra protection via rt_lock_idle_list() for
+ * the list manipulations against read access from
+ * wq_worker_sleeping(). All other places are nicely serialized via
+ * pool->lock.
+ *
* MG: pool->manager_mutex and pool->lock protected. Writes require both
* locks. Reads can happen under either lock.
*
* PL: wq_pool_mutex protected.
*
- * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
+ * PR: wq_pool_mutex protected for writes. RCU protected for reads.
*
* WQ: wq->mutex protected.
*
- * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+ * WR: wq->mutex protected for writes. RCU protected for reads.
*
* MD: wq_mayday_lock protected.
*/
@@ -178,7 +185,7 @@ struct worker_pool {
atomic_t nr_running ____cacheline_aligned_in_smp;
/*
- * Destruction of pool is sched-RCU protected to allow dereferences
+ * Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
@@ -207,7 +214,7 @@ struct pool_workqueue {
/*
* Release of unbound pwq is punted to system_wq. See put_pwq()
* and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also sched-RCU protected so that the first pwq can be
+ * itself is also RCU protected so that the first pwq can be
* determined without grabbing wq->mutex.
*/
struct work_struct unbound_release_work;
@@ -323,6 +330,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
+static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
+
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
const struct workqueue_attrs *from);
@@ -331,14 +340,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&wq_pool_mutex), \
- "sched RCU or wq_pool_mutex should be held")
+ "RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&wq->mutex), \
- "sched RCU or wq->mutex should be held")
+ "RCU or wq->mutex should be held")
#ifdef CONFIG_LOCKDEP
#define assert_manager_or_pool_lock(pool) \
@@ -360,7 +369,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
* @pool: iteration cursor
* @pi: integer used for iteration
*
- * This must be called either with wq_pool_mutex held or sched RCU read
+ * This must be called either with wq_pool_mutex held or RCU read
* locked. If the pool needs to be used beyond the locking in effect, the
* caller is responsible for guaranteeing that the pool stays online.
*
@@ -393,7 +402,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
* @pwq: iteration cursor
* @wq: the target workqueue
*
- * This must be called either with wq->mutex held or sched RCU read locked.
+ * This must be called either with wq->mutex held or RCU read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
@@ -405,6 +414,31 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
else
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+ preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+ preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+ spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+ spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
#ifdef CONFIG_DEBUG_OBJECTS_WORK
static struct debug_obj_descr work_debug_descr;
@@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
* @wq: the target workqueue
* @node: the node ID
*
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called either with pwq_lock held or RCU read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
@@ -652,8 +686,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
* @work: the work item of interest
*
* Pools are created and destroyed under wq_pool_mutex, and allows read
- * access under sched-RCU read lock. As such, this function should be
- * called under wq_pool_mutex or with preemption disabled.
+ * access under RCU read lock. As such, this function should be
+ * called under wq_pool_mutex or inside an rcu_read_lock() region.
*
* All fields of the returned pool are accessible as long as the above
* mentioned locking is in effect. If the returned pool needs to be used
@@ -804,51 +838,44 @@ static struct worker *first_worker(struct worker_pool *pool)
*/
static void wake_up_worker(struct worker_pool *pool)
{
- struct worker *worker = first_worker(pool);
+ struct worker *worker;
+
+ rt_lock_idle_list(pool);
+
+ worker = first_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
+
+ rt_unlock_idle_list(pool);
}
/**
- * wq_worker_waking_up - a worker is waking up
- * @task: task waking up
- * @cpu: CPU @task is waking up to
- *
- * This function is called during try_to_wake_up() when a worker is
- * being awoken.
+ * wq_worker_running - a worker is running again
+ * @task: task returning from sleep
*
- * CONTEXT:
- * spin_lock_irq(rq->lock)
+ * This function is called when a worker returns from schedule()
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu)
+void wq_worker_running(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
- if (!(worker->flags & WORKER_NOT_RUNNING)) {
- WARN_ON_ONCE(worker->pool->cpu != cpu);
+ if (!worker->sleeping)
+ return;
+ if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
- }
+ worker->sleeping = 0;
}
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
- * @cpu: CPU in question, must be the current CPU number
- *
- * This function is called during schedule() when a busy worker is
- * going to sleep. Worker on the same cpu can be woken up by
- * returning pointer to its task.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
- *
- * Return:
- * Worker task on @cpu to wake up, %NULL if none.
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
+void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct worker *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -857,29 +884,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
* checking NOT_RUNNING.
*/
if (worker->flags & WORKER_NOT_RUNNING)
- return NULL;
+ return;
pool = worker->pool;
- /* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
- return NULL;
+ if (WARN_ON_ONCE(worker->sleeping))
+ return;
+
+ worker->sleeping = 1;
/*
* The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
- *
- * NOT_RUNNING is clear. This means that we're bound to and
- * running on the local cpu w/ rq lock held and preemption
- * disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without pool
- * lock is safe.
*/
if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- to_wakeup = first_worker(pool);
- return to_wakeup ? to_wakeup->task : NULL;
+ !list_empty(&pool->worklist)) {
+ sched_lock_idle_list(pool);
+ wake_up_worker(pool);
+ sched_unlock_idle_list(pool);
+ }
}
/**
@@ -1086,12 +1110,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
if (pwq) {
/*
- * As both pwqs and pools are sched-RCU protected, the
+ * As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
- spin_lock_irq(&pwq->pool->lock);
+ local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
put_pwq(pwq);
- spin_unlock_irq(&pwq->pool->lock);
+ local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
}
}
@@ -1193,7 +1217,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
struct worker_pool *pool;
struct pool_workqueue *pwq;
- local_irq_save(*flags);
+ local_lock_irqsave(pendingb_lock, *flags);
/* try to steal the timer if it exists */
if (is_dwork) {
@@ -1212,6 +1236,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
return 0;
+ rcu_read_lock();
/*
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
@@ -1250,14 +1275,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
set_work_pool_and_keep_pending(work, pool->id);
spin_unlock(&pool->lock);
+ rcu_read_unlock();
return 1;
}
spin_unlock(&pool->lock);
fail:
- local_irq_restore(*flags);
+ rcu_read_unlock();
+ local_unlock_irqrestore(pendingb_lock, *flags);
if (work_is_canceling(work))
return -ENOENT;
- cpu_relax();
+ cpu_chill();
return -EAGAIN;
}
@@ -1326,7 +1353,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE_NONRT(!irqs_disabled());
debug_work_activate(work);
@@ -1334,6 +1361,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
+
+ rcu_read_lock();
retry:
if (req_cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
@@ -1390,10 +1419,8 @@ retry:
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
- if (WARN_ON(!list_empty(&work->entry))) {
- spin_unlock(&pwq->pool->lock);
- return;
- }
+ if (WARN_ON(!list_empty(&work->entry)))
+ goto out;
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
@@ -1409,7 +1436,9 @@ retry:
insert_work(pwq, work, worklist, work_flags);
+out:
spin_unlock(&pwq->pool->lock);
+ rcu_read_unlock();
}
/**
@@ -1429,14 +1458,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
bool ret = false;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(pendingb_lock,flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pendingb_lock, flags);
return ret;
}
EXPORT_SYMBOL(queue_work_on);
@@ -1503,14 +1532,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
unsigned long flags;
/* read the comment in __queue_work() */
- local_irq_save(flags);
+ local_lock_irqsave(pendingb_lock, flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_delayed_work(cpu, wq, dwork, delay);
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pendingb_lock, flags);
return ret;
}
EXPORT_SYMBOL(queue_delayed_work_on);
@@ -1545,7 +1574,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
if (likely(ret >= 0)) {
__queue_delayed_work(cpu, wq, dwork, delay);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pendingb_lock, flags);
}
/* -ENOENT from try_to_grab_pending() becomes %true */
@@ -1578,7 +1607,9 @@ static void worker_enter_idle(struct worker *worker)
worker->last_active = jiffies;
/* idle_list is LIFO */
+ rt_lock_idle_list(pool);
list_add(&worker->entry, &pool->idle_list);
+ rt_unlock_idle_list(pool);
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1611,7 +1642,9 @@ static void worker_leave_idle(struct worker *worker)
return;
worker_clr_flags(worker, WORKER_IDLE);
pool->nr_idle--;
+ rt_lock_idle_list(pool);
list_del_init(&worker->entry);
+ rt_unlock_idle_list(pool);
}
/**
@@ -1857,7 +1890,9 @@ static void destroy_worker(struct worker *worker)
*/
get_task_struct(worker->task);
+ rt_lock_idle_list(pool);
list_del_init(&worker->entry);
+ rt_unlock_idle_list(pool);
worker->flags |= WORKER_DIE;
idr_remove(&pool->worker_idr, worker->id);
@@ -2840,14 +2875,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
might_sleep();
- local_irq_disable();
+ rcu_read_lock();
pool = get_work_pool(work);
if (!pool) {
- local_irq_enable();
+ rcu_read_unlock();
return false;
}
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -2874,10 +2909,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
else
lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
-
+ rcu_read_unlock();
return true;
already_gone:
spin_unlock_irq(&pool->lock);
+ rcu_read_unlock();
return false;
}
@@ -2926,7 +2962,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
/* tell other tasks trying to grab @work to back off */
mark_work_canceling(work);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pendingb_lock, flags);
flush_work(work);
clear_work_data(work);
@@ -2971,10 +3007,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
*/
bool flush_delayed_work(struct delayed_work *dwork)
{
- local_irq_disable();
+ local_lock_irq(pendingb_lock);
if (del_timer_sync(&dwork->timer))
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
- local_irq_enable();
+ local_unlock_irq(pendingb_lock);
return flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);
@@ -3009,7 +3045,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
set_work_pool_and_clear_pending(&dwork->work,
get_work_pool_id(&dwork->work));
- local_irq_restore(flags);
+ local_unlock_irqrestore(pendingb_lock, flags);
return ret;
}
EXPORT_SYMBOL(cancel_delayed_work);
@@ -3195,7 +3231,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
const char *delim = "";
int node, written = 0;
- rcu_read_lock_sched();
+ get_online_cpus();
+ rcu_read_lock();
for_each_node(node) {
written += scnprintf(buf + written, PAGE_SIZE - written,
"%s%d:%d", delim, node,
@@ -3203,7 +3240,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
delim = " ";
}
written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
+ rcu_read_unlock();
+ put_online_cpus();
return written;
}
@@ -3570,7 +3608,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
*
- * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
* safe manner. get_unbound_pool() calls this function on its failure path
* and this function should be able to release pools which went through,
* successfully or not, init_worker_pool().
@@ -3617,8 +3655,8 @@ static void put_unbound_pool(struct worker_pool *pool)
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
- /* sched-RCU protected to allow dereferences from get_work_pool() */
- call_rcu_sched(&pool->rcu, rcu_free_pool);
+ /* RCU protected to allow dereferences from get_work_pool() */
+ call_rcu(&pool->rcu, rcu_free_pool);
}
/**
@@ -3731,7 +3769,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
put_unbound_pool(pool);
mutex_unlock(&wq_pool_mutex);
- call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+ call_rcu(&pwq->rcu, rcu_free_pwq);
/*
* If we're the last pwq going away, @wq is already dead and no one
@@ -4445,7 +4483,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock_sched();
+ rcu_read_lock();
+ preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
@@ -4456,7 +4495,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
ret = !list_empty(&pwq->delayed_works);
- rcu_read_unlock_sched();
+ preempt_enable();
+ rcu_read_unlock();
return ret;
}
@@ -4482,16 +4522,15 @@ unsigned int work_busy(struct work_struct *work)
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
- local_irq_save(flags);
+ rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
}
- local_irq_restore(flags);
-
+ rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(work_busy);
@@ -4939,16 +4978,16 @@ bool freeze_workqueues_busy(void)
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
- rcu_read_lock_sched();
+ rcu_read_lock();
for_each_pwq(pwq, wq) {
WARN_ON_ONCE(pwq->nr_active < 0);
if (pwq->nr_active) {
busy = true;
- rcu_read_unlock_sched();
+ rcu_read_unlock();
goto out_unlock;
}
}
- rcu_read_unlock_sched();
+ rcu_read_unlock();
}
out_unlock:
mutex_unlock(&wq_pool_mutex);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..2bb5b5a9a97c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -41,6 +41,7 @@ struct worker {
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
+ int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -66,7 +67,7 @@ static inline struct worker *current_wq_worker(void)
* Scheduler hooks for concurrency managed workqueue. Only to be used from
* sched/core.c and workqueue.c.
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
+void wq_worker_running(struct task_struct *task);
+void wq_worker_sleeping(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */