Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	887
1 file changed, 625 insertions(+), 262 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d0eb16373bf0..53725f94bf6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
@@ -349,6 +350,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -364,6 +371,12 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -566,7 +579,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -1013,6 +1026,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * rq->lock can be dropped here because the sched_class::switched_from()
+ * methods may drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1020,6 +1036,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possible rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1051,6 +1068,199 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(rq, cpu_rq(new_cpu));
+ set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ if (unlikely(!cpu_active(dest_cpu)))
+ return ret;
+
+ rq = cpu_rq(src_cpu);
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /* Already moved. */
+ if (task_cpu(p) != src_cpu)
+ goto done;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ goto fail;
+
+ /*
+ * If we're not on a rq, the next wake-up will ensure we're
+ * placed properly.
+ */
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+done:
+ ret = 1;
+fail:
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+ return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+ __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+ local_irq_enable();
+ return 0;
+}
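Tying the pieces above together, a minimal sketch (the helper name is invented for illustration): a caller packs a migration_arg and runs migration_cpu_stop() on the task's current CPU via stop_one_cpu(), which is what __set_cpus_allowed_ptr() below does when the task is currently running.

/*
 * Illustrative sketch only: kick off steps 1-5 of the migration scheme
 * described above for task 'p'.  Assumes the caller holds a reference
 * on 'p' and that dest_cpu is an active CPU allowed by p's affinity.
 */
static void example_push_task(struct task_struct *p, int dest_cpu)
{
	struct migration_arg arg = { p, dest_cpu };

	/* Runs migration_cpu_stop() on p's current CPU and waits for it. */
	stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}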
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ lockdep_assert_held(&p->pi_lock);
+
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
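For completeness, a minimal (hypothetical) usage sketch of the exported API above:

/*
 * Illustrative only: pin a task to a single CPU using set_cpus_allowed_ptr().
 * Returns 0 on success, or -EINVAL if 'cpu' is not active.
 */
static int example_pin_task(struct task_struct *p, int cpu)
{
	return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}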
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1084,6 +1294,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ walt_fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1191,13 +1403,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1330,9 +1535,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1436,7 +1639,16 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1690,6 +1902,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
+#ifdef CONFIG_SMP
+ struct rq *rq;
+ u64 wallclock;
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1742,6 +1958,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1749,10 +1973,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
@@ -1791,8 +2017,13 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
@@ -1836,6 +2067,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1855,13 +2090,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2092,6 +2332,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2146,6 +2389,11 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+ walt_init_new_task_load(p);
+
+ /* Initialize new task's runnable average */
+ init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2155,10 +2403,9 @@ void wake_up_new_task(struct task_struct *p)
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Initialize new task's runnable average */
- init_task_runnable_average(p);
rq = __task_rq_lock(p);
- activate_task(rq, p, 0);
+ walt_mark_task_starting(p);
+ activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
@@ -2470,11 +2717,41 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update the average to avoid reading a stale value if there were
+ * no run-queue changes for a long time. On the other hand, if the
+ * changes are happening right now, just read the current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2556,6 +2833,93 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+
+static inline
+unsigned long add_capacity_margin(unsigned long cpu_capacity)
+{
+ cpu_capacity = cpu_capacity * capacity_margin;
+ cpu_capacity /= SCHED_CAPACITY_SCALE;
+ return cpu_capacity;
+}
+
+static inline
+unsigned long sum_capacity_reqs(unsigned long cfs_cap,
+ struct sched_capacity_reqs *scr)
+{
+ unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
+ return total + scr->dl;
+}
+
+static void sched_freq_tick_pelt(int cpu)
+{
+ unsigned long cpu_utilization = capacity_max;
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+ if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
+ return;
+
+ /*
+ * To make room for a task that is building up its "real"
+ * utilization, and to harm its performance as little as possible,
+ * request a jump to a higher OPP as soon as the margin of free
+ * capacity is impacted (as specified by capacity_margin).
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+}
+
+#ifdef CONFIG_SCHED_WALT
+static void sched_freq_tick_walt(int cpu)
+{
+ unsigned long cpu_utilization = cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+
+ if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
+ return sched_freq_tick_pelt(cpu);
+
+ /*
+ * Add a margin to the WALT utilization.
+ * NOTE: WALT tracks a single CPU signal for all the scheduling
+ * classes, thus this margin is going to be added to the DL class as
+ * well, which is something we do not do in the sched_freq_tick_pelt() case.
+ */
+ cpu_utilization = add_capacity_margin(cpu_utilization);
+ if (cpu_utilization <= capacity_curr)
+ return;
+
+ /*
+ * It is likely that the load is growing, so we
+ * keep the added margin in our request as an
+ * extra boost.
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+
+}
+#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
+#else
+#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
+#endif /* CONFIG_SCHED_WALT */
+
+static void sched_freq_tick(int cpu)
+{
+ unsigned long capacity_orig, capacity_curr;
+
+ if (!sched_freq())
+ return;
+
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_curr = capacity_curr_of(cpu);
+ if (capacity_curr == capacity_orig)
+ return;
+
+ _sched_freq_tick(cpu);
+}
+#else
+static inline void sched_freq_tick(int cpu) { }
+#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
+
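A brief worked example of the margin arithmetic used by sched_freq_tick_pelt()/sched_freq_tick_walt() above, assuming the customary values capacity_margin = 1280 and SCHED_CAPACITY_SCALE = 1024 (neither is defined in this hunk):

/*
 * Illustrative only: add_capacity_margin() scales a utilization value
 * by capacity_margin / SCHED_CAPACITY_SCALE.  With the assumed values
 * 1280 / 1024 that is a factor of 1.25, so a utilization of 400
 * requests capacity for 500, and 800 requests 1000.
 */
static unsigned long example_add_margin(unsigned long util)
{
	return util * 1280 / 1024;	/* assumed margin and scale */
}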
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -2569,9 +2933,14 @@ void scheduler_tick(void)
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ walt_set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
+ calc_global_load_tick(rq);
+ sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2600,7 +2969,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2808,6 +3177,7 @@ static void __sched __schedule(void)
unsigned long *switch_count;
struct rq *rq;
int cpu;
+ u64 wallclock;
need_resched:
preempt_disable();
@@ -2857,6 +3227,9 @@ need_resched:
update_rq_clock(rq);
next = pick_next_task(rq, prev);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -3313,15 +3686,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
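As an aside, a rough illustration of the bandwidth value set above: dl_bw is a fixed-point runtime/period ratio, and assuming to_ratio() uses a 20-bit shift it can be pictured as:

/*
 * Illustrative only, assuming to_ratio() computes (runtime << 20) / period.
 * E.g. runtime = 10 ms, period = 100 ms:
 *   (10000000ULL << 20) / 100000000 == 104857 ~= 0.1 * (1 << 20),
 * i.e. a 10% bandwidth reservation.
 */
static u64 example_dl_bw(u64 runtime_ns, u64 period_ns)
{
	return div64_u64(runtime_ns << 20, period_ns);
}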
/*
@@ -3736,6 +4125,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4116,7 +4506,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4432,36 +4822,26 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
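For context, the usual counterpart of this change (hedged, since it lives outside this file) is that io_schedule() becomes a thin wrapper around the function kept here:

/* Illustrative only: io_schedule() expressed via io_schedule_timeout(). */
static inline void example_io_schedule(void)
{
	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}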
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -4657,9 +5037,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
+
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4683,7 +5065,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4700,149 +5083,6 @@ void init_idle(struct task_struct *idle, int cpu)
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4890,35 +5130,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -5021,7 +5235,6 @@ static void migrate_tasks(unsigned int dead_cpu)
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5094,9 +5307,60 @@ set_table_entry(struct ctl_table *entry,
}
static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+ sge->nr_idle_states*sizeof(struct idle_state), 0644,
+ proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+ sge->nr_cap_states*sizeof(struct capacity_state), 0644,
+ proc_doulongvec_minmax, false);
+
+ return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+ if (table == NULL)
+ return NULL;
+
+ table->procname = kstrdup("energy", GFP_KERNEL);
+ table->mode = 0555;
+ table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+ return table;
+}
+
+static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table;
+ unsigned int nr_entries = 14;
+
+ int i = 0;
+ struct sched_group *sg = sd->groups;
+
+ if (sg->sge) {
+ int nr_sgs = 0;
+
+ do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+ nr_entries += nr_sgs;
+ }
+
+ table = sd_alloc_ctl_entry(nr_entries);
if (table == NULL)
return NULL;
@@ -5129,7 +5393,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ sg = sd->groups;
+ if (sg->sge) {
+ char buf[32];
+ struct ctl_table *entry = &table[13];
+
+ do {
+ snprintf(buf, 32, "group%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_group_table(sg);
+ } while (entry++, i++, sg = sg->next, sg != sd->groups);
+ }
+ /* &table[nr_entries-1] is terminator */
return table;
}
@@ -5200,7 +5476,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5246,6 +5522,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_set_window_start(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
break;
@@ -5265,6 +5544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_migrate_sync_cpu(cpu);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -5378,9 +5658,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -5439,17 +5716,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5469,7 +5735,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %s", str);
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -5530,7 +5796,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5562,7 +5829,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5641,6 +5909,8 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
return 0;
free_rto_mask:
@@ -5746,11 +6016,13 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
+ struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -5771,6 +6043,17 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
/*
@@ -5950,7 +6233,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6080,6 +6363,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
}
/*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+ const struct cpumask *cpumask)
+{
+ const struct sched_group_energy * const sge = fn(cpu);
+ struct cpumask mask;
+ int i;
+
+ if (cpumask_weight(cpumask) <= 1)
+ return;
+
+ cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask) {
+ const struct sched_group_energy * const e = fn(i);
+ int y;
+
+ BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+ for (y = 0; y < (e->nr_idle_states); y++) {
+ BUG_ON(e->idle_states[y].power !=
+ sge->idle_states[y].power);
+ }
+
+ BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+ for (y = 0; y < (e->nr_cap_states); y++) {
+ BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+ BUG_ON(e->cap_states[y].power !=
+ sge->cap_states[y].power);
+ }
+ }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sd->groups))
+ return;
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+ sd->groups->sge = fn(cpu);
+}
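To make the expected data shape concrete, here is a hypothetical energy table of the kind init_sched_energy() wires up; all numbers are invented and the struct field layout is assumed from the accessors used above (real platforms supply these tables from their topology code):

/* Illustrative only: a made-up per-cluster energy model. */
static struct idle_state example_idle_states[] = {
	{ .power = 10 },		/* shallow idle */
	{ .power = 0 },			/* cluster off */
};

static struct capacity_state example_cap_states[] = {
	{ .cap = 512,  .power = 100 },	/* low OPP */
	{ .cap = 1024, .power = 300 },	/* high OPP */
};

static struct sched_group_energy example_sge = {
	.nr_idle_states	= ARRAY_SIZE(example_idle_states),
	.idle_states	= example_idle_states,
	.nr_cap_states	= ARRAY_SIZE(example_cap_states),
	.cap_states	= example_cap_states,
};

/* A sched_domain_energy_f provider returning the same table for every CPU. */
static const struct sched_group_energy *example_energy_fn(int cpu)
{
	return &example_sge;
}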
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -6185,6 +6528,7 @@ static int sched_domains_curr_level;
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -6194,7 +6538,8 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl, int cpu)
@@ -6259,6 +6604,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6583,7 +6929,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -6687,6 +7033,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
+ struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6725,10 +7072,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -6737,6 +7087,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
@@ -7001,6 +7352,7 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
+ walt_init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -7056,6 +7408,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7116,11 +7471,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -7173,7 +7529,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7183,6 +7539,11 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7211,6 +7572,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7220,11 +7586,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7246,14 +7607,24 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -7402,7 +7773,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
@@ -7410,7 +7781,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -7505,7 +7876,7 @@ void sched_move_task(struct task_struct *tsk)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, queued);
+ tsk->sched_class->task_move_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -7969,14 +8340,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task)
{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
-
sched_move_task(task);
}