diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 558 |
1 files changed, 464 insertions, 94 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e973efc036e..71c0bb1cdeb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -272,7 +272,11 @@ late_initcall(sched_init_debug); * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL const_debug unsigned int sysctl_sched_nr_migrate = 32; +#else +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#endif /* * period over which we average the RT time consumption, measured @@ -489,6 +493,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.irqsafe = 1; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -534,6 +539,37 @@ void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } +#ifdef CONFIG_PREEMPT_LAZY +void resched_task_lazy(struct task_struct *p) +{ + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_task(p); + return; + } + + lockdep_assert_held(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + if (test_tsk_need_resched_lazy(p)) + return; + + set_tsk_need_resched_lazy(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -660,17 +696,34 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL + +static int ksoftirqd_running(void) +{ + struct task_struct *softirqd; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) + return 0; + softirqd = this_cpu_ksoftirqd(); + if (softirqd && softirqd->on_rq) + return 1; + return 0; +} + bool sched_can_stop_tick(void) { - struct rq *rq; + struct rq *rq; - rq = this_rq(); + rq = this_rq(); - /* Make sure rq->nr_running update is visible after the IPI */ - smp_rmb(); + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); - /* More than one running task need preemption */ - if (rq->nr_running > 1) + /* + * More than one running task need preemption + * + * NOTE, RT: if ksoftirqd is awake, subtract it. + */ + if (rq->nr_running - ksoftirqd_running() > 1) return false; return true; @@ -1122,6 +1175,18 @@ struct migration_arg { static int migration_cpu_stop(void *data); +static bool check_task_state(struct task_struct *p, long match_state) +{ + bool match = false; + + raw_spin_lock_irq(&p->pi_lock); + if (p->state == match_state || p->saved_state == match_state) + match = true; + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * wait_task_inactive - wait for a thread to unschedule. * @@ -1166,7 +1231,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && !check_task_state(p, match_state)) return 0; cpu_relax(); } @@ -1181,7 +1246,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->on_rq; ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || p->state == match_state + || p->saved_state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &flags); @@ -1406,10 +1472,6 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = 1; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -1592,8 +1654,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + success = 1; + } + } goto out; + } + + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1636,42 +1717,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!p->on_rq) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -1685,11 +1730,23 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); + WARN_ON(__task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. + */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -1867,6 +1924,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @@ -2150,11 +2210,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2514,8 +2576,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2558,6 +2625,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -2581,6 +2655,133 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) + +static inline void update_migrate_disable(struct task_struct *p) +{ + const struct cpumask *mask; + + if (likely(!p->migrate_disable)) + return; + + /* Did we already update affinity? */ + if (unlikely(migrate_disabled_updated(p))) + return; + + /* + * Since this is always current we can get away with only locking + * rq->lock, the ->cpus_allowed value can normally only be changed + * while holding both p->pi_lock and rq->lock, but seeing that this + * is current, we cannot actually be waking up, so all code that + * relies on serialization against p->pi_lock is out of scope. + * + * Having rq->lock serializes us against things like + * set_cpus_allowed_ptr() that can still happen concurrently. + */ + mask = tsk_cpus_allowed(p); + + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */ + p->nr_cpus_allowed = 1; + + /* Let migrate_enable know to fix things back up */ + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); + } +#endif + + if (p->migrate_disable) { + p->migrate_disable++; + return; + } + + preempt_disable(); + preempt_lazy_disable(); + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + const struct cpumask *mask; + unsigned long flags; + struct rq *rq; + + if (in_atomic()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } + +#ifdef CONFIG_SCHED_DEBUG + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); + } +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + if (migrate_disable_count(p) > 1) { + p->migrate_disable--; + return; + } + + preempt_disable(); + if (unlikely(migrate_disabled_updated(p))) { + /* + * Undo whatever update_migrate_disable() did, also see there + * about locking. + */ + rq = this_rq(); + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. + */ + p->migrate_disable = 0; + mask = tsk_cpus_allowed(p); + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, mask); + p->nr_cpus_allowed = cpumask_weight(mask); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else + p->migrate_disable = 0; + + unpin_current_cpu(); + preempt_enable(); + preempt_lazy_enable(); +} +EXPORT_SYMBOL(migrate_enable); +#else +static inline void update_migrate_disable(struct task_struct *p) { } +#define migrate_disabled_updated(p) 0 +#endif + static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->on_rq || rq->skip_clock_update < 0) @@ -2680,6 +2881,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + update_migrate_disable(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2687,19 +2890,6 @@ need_resched: } else { deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } } switch_count = &prev->nvcsw; } @@ -2712,6 +2902,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -2741,8 +2932,19 @@ need_resched: static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) + return; + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + + + if (tsk_is_pi_blocked(tsk)) return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -2751,12 +2953,19 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static inline void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); __schedule(); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -2802,9 +3011,26 @@ asmlinkage void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; +#ifdef CONFIG_PREEMPT_LAZY + /* + * Check for lazy preemption + */ + if (current_thread_info()->preempt_lazy_count && + !test_thread_flag(TIF_NEED_RESCHED)) + return; +#endif do { __preempt_count_add(PREEMPT_ACTIVE); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. + */ + start_critical_timings(); __schedule(); + stop_critical_timings(); __preempt_count_sub(PREEMPT_ACTIVE); /* @@ -2912,7 +3138,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3195,9 +3422,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_new = 1; } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3217,9 +3443,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); + set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); + + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3227,8 +3465,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3304,6 +3540,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3428,6 +3666,7 @@ recheck: if (dl_policy(policy)) goto change; + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } @@ -3481,6 +3720,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3488,16 +3745,18 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4106,9 +4365,16 @@ SYSCALL_DEFINE0(sched_yield) static void __cond_resched(void) { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + /* + * Check again in case we missed a preemption + * opportunity between schedule and now. + */ + barrier(); + } while (need_resched()); } int __sched _cond_resched(void) @@ -4149,6 +4415,7 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +#ifndef CONFIG_PREEMPT_RT_FULL int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -4162,6 +4429,7 @@ int __sched __cond_resched_softirq(void) return 0; } EXPORT_SYMBOL(__cond_resched_softirq); +#endif /** * yield - yield the current processor to other threads. @@ -4515,6 +4783,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4522,7 +4791,9 @@ void init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -4537,11 +4808,90 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - + if (!migrate_disabled_updated(p)) { + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } cpumask_copy(&p->cpus_allowed, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; + +void tell_sched_cpu_down_begin(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} + +/** + * migrate_me - try to move the current task off this cpu + * + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. + * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. + */ +int migrate_me(void) +{ + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; + unsigned long flags; + unsigned int dest_cpu; + struct rq *rq; + + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_NO_SETAFFINITY || p->state) + return 0; + + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); + + cpumask = &__get_cpu_var(sched_cpumasks); + mask = &p->cpus_allowed; + + cpumask_andnot(cpumask, mask, &sched_down_cpumask); + + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } + + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); + + arg.task = p; + arg.dest_cpu = dest_cpu; + + task_rq_unlock(rq, p, &flags); + + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); + + return 1; } /* @@ -4587,7 +4937,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); @@ -4724,6 +5074,8 @@ static int migration_cpu_stop(void *data) #ifdef CONFIG_HOTPLUG_CPU +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); + /* * Ensures that the idle task is using init_mm right before its cpu goes * offline. @@ -4736,7 +5088,12 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); - mmdrop(mm); + + /* + * Defer the cleanup to an alive cpu. On RT we can neither + * call mmdrop() nor mmdrop_delayed() from here. + */ + per_cpu(idle_last_mm, smp_processor_id()) = mm; } /* @@ -5060,6 +5417,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: calc_load_migrate(rq); + if (per_cpu(idle_last_mm, cpu)) { + mmdrop(per_cpu(idle_last_mm, cpu)); + per_cpu(idle_last_mm, cpu) = NULL; + } break; #endif } @@ -6968,7 +7329,8 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + + sched_rcu_preempt_depth(); return (nested == preempt_offset); } @@ -6978,7 +7340,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6996,6 +7359,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); |