aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c558
1 files changed, 464 insertions, 94 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e973efc036e..71c0bb1cdeb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -272,7 +272,11 @@ late_initcall(sched_init_debug);
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
+#ifndef CONFIG_PREEMPT_RT_FULL
const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#endif
/*
* period over which we average the RT time consumption, measured
@@ -489,6 +493,7 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
+ rq->hrtick_timer.irqsafe = 1;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -534,6 +539,37 @@ void resched_task(struct task_struct *p)
smp_send_reschedule(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+void resched_task_lazy(struct task_struct *p)
+{
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_task(p);
+ return;
+ }
+
+ lockdep_assert_held(&task_rq(p)->lock);
+
+ if (test_tsk_need_resched(p))
+ return;
+
+ if (test_tsk_need_resched_lazy(p))
+ return;
+
+ set_tsk_need_resched_lazy(p);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(p))
+ smp_send_reschedule(cpu);
+}
+#endif
+
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -660,17 +696,34 @@ static inline bool got_nohz_idle_kick(void)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+
+static int ksoftirqd_running(void)
+{
+ struct task_struct *softirqd;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
+ return 0;
+ softirqd = this_cpu_ksoftirqd();
+ if (softirqd && softirqd->on_rq)
+ return 1;
+ return 0;
+}
+
bool sched_can_stop_tick(void)
{
- struct rq *rq;
+ struct rq *rq;
- rq = this_rq();
+ rq = this_rq();
- /* Make sure rq->nr_running update is visible after the IPI */
- smp_rmb();
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
- /* More than one running task need preemption */
- if (rq->nr_running > 1)
+ /*
+ * More than one running task need preemption
+ *
+ * NOTE, RT: if ksoftirqd is awake, subtract it.
+ */
+ if (rq->nr_running - ksoftirqd_running() > 1)
return false;
return true;
@@ -1122,6 +1175,18 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
+static bool check_task_state(struct task_struct *p, long match_state)
+{
+ bool match = false;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ if (p->state == match_state || p->saved_state == match_state)
+ match = true;
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+}
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1166,7 +1231,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ if (match_state && !check_task_state(p, match_state))
return 0;
cpu_relax();
}
@@ -1181,7 +1246,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
on_rq = p->on_rq;
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if (!match_state || p->state == match_state
+ || p->saved_state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &flags);
@@ -1406,10 +1472,6 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = 1;
-
- /* if a worker is waking up, notify workqueue */
- if (p->flags & PF_WQ_WORKER)
- wq_worker_waking_up(p, cpu_of(rq));
}
/*
@@ -1592,8 +1654,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- if (!(p->state & state))
+ if (!(p->state & state)) {
+ /*
+ * The task might be running due to a spinlock sleeper
+ * wakeup. Check the saved state and set it to running
+ * if the wakeup condition is true.
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ success = 1;
+ }
+ }
goto out;
+ }
+
+ /*
+ * If this is a regular wakeup, then we can unconditionally
+ * clear the saved state of a "lock sleeper".
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER))
+ p->saved_state = TASK_RUNNING;
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1636,42 +1717,6 @@ out:
}
/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
- struct rq *rq = task_rq(p);
-
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
- return;
-
- lockdep_assert_held(&rq->lock);
-
- if (!raw_spin_trylock(&p->pi_lock)) {
- raw_spin_unlock(&rq->lock);
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- }
-
- if (!(p->state & TASK_NORMAL))
- goto out;
-
- if (!p->on_rq)
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-
- ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
-out:
- raw_spin_unlock(&p->pi_lock);
-}
-
-/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -1685,11 +1730,23 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- WARN_ON(task_is_stopped_or_traced(p));
+ WARN_ON(__task_is_stopped_or_traced(p));
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
+ * @p: The process to be woken up.
+ *
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
+ * the nature of the wakeup.
+ */
+int wake_up_lock_sleeper(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
+}
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
@@ -1867,6 +1924,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -2150,11 +2210,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
+ /*
+ * We use mmdrop_delayed() here so we don't have to do the
+ * full __mmdrop() when we are the last user.
+ */
if (mm)
- mmdrop(mm);
+ mmdrop_delayed(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2514,8 +2576,13 @@ void __kprobes preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
EXPORT_SYMBOL(preempt_count_add);
@@ -2558,6 +2625,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -2581,6 +2655,133 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
+#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
+#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */
+#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN)
+#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN)
+
+static inline void update_migrate_disable(struct task_struct *p)
+{
+ const struct cpumask *mask;
+
+ if (likely(!p->migrate_disable))
+ return;
+
+ /* Did we already update affinity? */
+ if (unlikely(migrate_disabled_updated(p)))
+ return;
+
+ /*
+ * Since this is always current we can get away with only locking
+ * rq->lock, the ->cpus_allowed value can normally only be changed
+ * while holding both p->pi_lock and rq->lock, but seeing that this
+ * is current, we cannot actually be waking up, so all code that
+ * relies on serialization against p->pi_lock is out of scope.
+ *
+ * Having rq->lock serializes us against things like
+ * set_cpus_allowed_ptr() that can still happen concurrently.
+ */
+ mask = tsk_cpus_allowed(p);
+
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, mask);
+ /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */
+ p->nr_cpus_allowed = 1;
+
+ /* Let migrate_enable know to fix things back up */
+ p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN;
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (in_atomic()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic++;
+#endif
+ return;
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ if (unlikely(p->migrate_disable_atomic)) {
+ tracing_off();
+ WARN_ON_ONCE(1);
+ }
+#endif
+
+ if (p->migrate_disable) {
+ p->migrate_disable++;
+ return;
+ }
+
+ preempt_disable();
+ preempt_lazy_disable();
+ pin_current_cpu();
+ p->migrate_disable = 1;
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+ const struct cpumask *mask;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (in_atomic()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic--;
+#endif
+ return;
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ if (unlikely(p->migrate_disable_atomic)) {
+ tracing_off();
+ WARN_ON_ONCE(1);
+ }
+#endif
+ WARN_ON_ONCE(p->migrate_disable <= 0);
+
+ if (migrate_disable_count(p) > 1) {
+ p->migrate_disable--;
+ return;
+ }
+
+ preempt_disable();
+ if (unlikely(migrate_disabled_updated(p))) {
+ /*
+ * Undo whatever update_migrate_disable() did, also see there
+ * about locking.
+ */
+ rq = this_rq();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * Clearing migrate_disable causes tsk_cpus_allowed to
+ * show the tasks original cpu affinity.
+ */
+ p->migrate_disable = 0;
+ mask = tsk_cpus_allowed(p);
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, mask);
+ p->nr_cpus_allowed = cpumask_weight(mask);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ } else
+ p->migrate_disable = 0;
+
+ unpin_current_cpu();
+ preempt_enable();
+ preempt_lazy_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+#else
+static inline void update_migrate_disable(struct task_struct *p) { }
+#define migrate_disabled_updated(p) 0
+#endif
+
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
if (prev->on_rq || rq->skip_clock_update < 0)
@@ -2680,6 +2881,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ update_migrate_disable(prev);
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2687,19 +2890,6 @@ need_resched:
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
-
- /*
- * If a worker went to sleep, notify and ask workqueue
- * whether it wants to wake up a task to maintain
- * concurrency.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
- to_wakeup = wq_worker_sleeping(prev, cpu);
- if (to_wakeup)
- try_to_wake_up_local(to_wakeup);
- }
}
switch_count = &prev->nvcsw;
}
@@ -2712,6 +2902,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -2741,8 +2932,19 @@ need_resched:
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
+ return;
+ /*
+ * If a worker went to sleep, notify and ask workqueue whether
+ * it wants to wake up a task to maintain concurrency.
+ */
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+
+
+ if (tsk_is_pi_blocked(tsk))
return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -2751,12 +2953,19 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
+static inline void sched_update_worker(struct task_struct *tsk)
+{
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+}
+
asmlinkage void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
__schedule();
+ sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -2802,9 +3011,26 @@ asmlinkage void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
+#ifdef CONFIG_PREEMPT_LAZY
+ /*
+ * Check for lazy preemption
+ */
+ if (current_thread_info()->preempt_lazy_count &&
+ !test_thread_flag(TIF_NEED_RESCHED))
+ return;
+#endif
do {
__preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * The add/subtract must not be traced by the function
+ * tracer. But we still want to account for the
+ * preempt off latency tracer. Since the _notrace versions
+ * of add/subtract skip the accounting for latency tracer
+ * we must force it manually.
+ */
+ start_critical_timings();
__schedule();
+ stop_critical_timings();
__preempt_count_sub(PREEMPT_ACTIVE);
/*
@@ -2912,7 +3138,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
@@ -3195,9 +3422,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_new = 1;
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -3217,9 +3443,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* getparam()/getattr() don't report silly values for !rt tasks.
*/
p->rt_priority = attr->sched_priority;
-
p->normal_prio = normal_prio(p);
- p->prio = rt_mutex_getprio(p);
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there was no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3227,8 +3465,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
-
- set_load_weight(p);
}
static void
@@ -3304,6 +3540,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
int policy = attr->sched_policy;
unsigned long flags;
@@ -3428,6 +3666,7 @@ recheck:
if (dl_policy(policy))
goto change;
+ p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -3481,6 +3720,24 @@ change:
return -EBUSY;
}
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -3488,16 +3745,18 @@ change:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4106,9 +4365,16 @@ SYSCALL_DEFINE0(sched_yield)
static void __cond_resched(void)
{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ /*
+ * Check again in case we missed a preemption
+ * opportunity between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
}
int __sched _cond_resched(void)
@@ -4149,6 +4415,7 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
+#ifndef CONFIG_PREEMPT_RT_FULL
int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
@@ -4162,6 +4429,7 @@ int __sched __cond_resched_softirq(void)
return 0;
}
EXPORT_SYMBOL(__cond_resched_softirq);
+#endif
/**
* yield - yield the current processor to other threads.
@@ -4515,6 +4783,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4522,7 +4791,9 @@ void init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
@@ -4537,11 +4808,90 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
+ if (!migrate_disabled_updated(p)) {
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+ }
cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
+static DEFINE_MUTEX(sched_down_mutex);
+static cpumask_t sched_down_cpumask;
+
+void tell_sched_cpu_down_begin(int cpu)
+{
+ mutex_lock(&sched_down_mutex);
+ cpumask_set_cpu(cpu, &sched_down_cpumask);
+ mutex_unlock(&sched_down_mutex);
+}
+
+void tell_sched_cpu_down_done(int cpu)
+{
+ mutex_lock(&sched_down_mutex);
+ cpumask_clear_cpu(cpu, &sched_down_cpumask);
+ mutex_unlock(&sched_down_mutex);
+}
+
+/**
+ * migrate_me - try to move the current task off this cpu
+ *
+ * Used by the pin_current_cpu() code to try to get tasks
+ * to move off the current CPU as it is going down.
+ * It will only move the task if the task isn't pinned to
+ * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
+ * and the task has to be in a RUNNING state. Otherwise the
+ * movement of the task will wake it up (change its state
+ * to running) when the task did not expect it.
+ *
+ * Returns 1 if it succeeded in moving the current task
+ * 0 otherwise.
+ */
+int migrate_me(void)
+{
+ struct task_struct *p = current;
+ struct migration_arg arg;
+ struct cpumask *cpumask;
+ struct cpumask *mask;
+ unsigned long flags;
+ unsigned int dest_cpu;
+ struct rq *rq;
+
+ /*
+ * We can not migrate tasks bounded to a CPU or tasks not
+ * running. The movement of the task will wake it up.
+ */
+ if (p->flags & PF_NO_SETAFFINITY || p->state)
+ return 0;
+
+ mutex_lock(&sched_down_mutex);
+ rq = task_rq_lock(p, &flags);
+
+ cpumask = &__get_cpu_var(sched_cpumasks);
+ mask = &p->cpus_allowed;
+
+ cpumask_andnot(cpumask, mask, &sched_down_cpumask);
+
+ if (!cpumask_weight(cpumask)) {
+ /* It's only on this CPU? */
+ task_rq_unlock(rq, p, &flags);
+ mutex_unlock(&sched_down_mutex);
+ return 0;
+ }
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
+
+ arg.task = p;
+ arg.dest_cpu = dest_cpu;
+
+ task_rq_unlock(rq, p, &flags);
+
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ mutex_unlock(&sched_down_mutex);
+
+ return 1;
}
/*
@@ -4587,7 +4937,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
@@ -4724,6 +5074,8 @@ static int migration_cpu_stop(void *data)
#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
+
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -4736,7 +5088,12 @@ void idle_task_exit(void)
if (mm != &init_mm)
switch_mm(mm, &init_mm, current);
- mmdrop(mm);
+
+ /*
+ * Defer the cleanup to an alive cpu. On RT we can neither
+ * call mmdrop() nor mmdrop_delayed() from here.
+ */
+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
}
/*
@@ -5060,6 +5417,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD:
calc_load_migrate(rq);
+ if (per_cpu(idle_last_mm, cpu)) {
+ mmdrop(per_cpu(idle_last_mm, cpu));
+ per_cpu(idle_last_mm, cpu) = NULL;
+ }
break;
#endif
}
@@ -6968,7 +7329,8 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+ int nested = (preempt_count() & ~PREEMPT_ACTIVE) +
+ sched_rcu_preempt_depth();
return (nested == preempt_offset);
}
@@ -6978,7 +7340,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6996,6 +7359,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);