about summary refs log tree commit diff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- kernel/sched/fair.c 239
1 files changed, 115 insertions, 124 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b54715b..dda9b194d225 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -191,7 +191,7 @@ static void update_sysctl(void)
#undef SET_SYSCTL
}
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
{
update_sysctl();
}
@@ -1094,7 +1094,7 @@ struct numa_group {
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
- unsigned long faults[0];
+ unsigned long faults[];
};
/*
@@ -3441,52 +3441,46 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+ se->avg.util_sum = se->avg.util_avg * divider;
/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
/* Update parent cfs_rq runnable */
add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
}
static inline void
@@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
unsigned long load_avg;
u64 load_sum = 0;
s64 delta_sum;
+ u32 divider;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
if (runnable_sum >= 0) {
/*
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
* the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ runnable_sum = min_t(long, runnable_sum, divider);
} else {
/*
* Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+ load_avg = div_s64(load_sum, divider);
delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
delta_avg = load_avg - se->avg.load_avg;
@@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/*
@@ -3873,6 +3878,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -4054,7 +4061,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -4588,16 +4595,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4609,13 +4616,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4723,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled, the timer might not unthrottle us for an
+ * entire period. We additionally need to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just checking cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4744,29 +4783,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);
- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
- /*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
- */
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
/*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4933,14 +4956,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/*
* This check is repeated as we release cfs_b->lock while we unthrottle.
*/
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
- cfs_b->distribute_running = 1;
+ while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
@@ -5054,10 +5075,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
- if (cfs_b->distribute_running) {
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
- return;
- }
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5067,9 +5084,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- if (runtime)
- cfs_b->distribute_running = 1;
-
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
@@ -5078,7 +5092,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -5139,8 +5152,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5170,6 +5182,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -5199,8 +5213,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5221,7 +5233,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
- cfs_b->distribute_running = 0;
cfs_b->slack_started = false;
}
@@ -5506,28 +5517,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
list_add_leaf_cfs_rq(cfs_rq);
}
-enqueue_throttle:
- if (!se) {
- add_nr_running(rq, 1);
- /*
- * Since new tasks are assigned an initial util_avg equal to
- * half of the spare capacity of their CPU, tiny tasks have the
- * ability to cross the overutilized threshold, which will
- * result in the load balancer ruining all the task placement
- * done by EAS. As a way to mitigate that effect, do not account
- * for the first enqueue operation of new tasks during the
- * overutilized flag detection.
- *
- * A better way of solving this problem would be to wait for
- * the PELT signals of tasks to converge before taking them
- * into account, but that is not straightforward to implement,
- * and the following generally works well enough in practice.
- */
- if (flags & ENQUEUE_WAKEUP)
- update_overutilized_status(rq);
+ /* At this point se is NULL and we are at root level */
+ add_nr_running(rq, 1);
- }
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5737,7 +5747,7 @@ static int wake_wide(struct task_struct *p)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int factor = __this_cpu_read(sd_llc_size);
if (master < slave)
swap(master, slave);
@@ -5846,8 +5856,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5930,7 +5939,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
+ group = find_idlest_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
@@ -6671,9 +6680,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
rcu_read_lock();
for_each_domain(cpu, tmp) {
- if (!(tmp->flags & SD_LOAD_BALANCE))
- break;
-
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
@@ -8702,8 +8708,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9434,7 +9439,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu, balance_cpu = -1;
+ int cpu;
/*
* Ensure the balancing environment is consistent; can happen
@@ -9455,18 +9460,12 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
- balance_cpu = cpu;
- break;
+ /* Are we the first idle CPU? */
+ return cpu == env->dst_cpu;
}
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
-
- /*
- * First idle CPU or the first CPU(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
+ /* Are we the first CPU of this group? */
+ return group_balance_cpu(sg) == env->dst_cpu;
}
/*
@@ -9819,9 +9818,8 @@ static int active_load_balance_cpu_stop(void *data)
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
}
if (likely(sd)) {
@@ -9910,9 +9908,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
max_cost += sd->max_newidle_lb_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
@@ -10034,12 +10029,11 @@ static void kick_ilb(unsigned int flags)
return;
/*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
+ * Send an async function-call IPI to the target CPU, which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
- smp_send_reschedule(ilb_cpu);
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/*
@@ -10450,7 +10444,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -10501,9 +10495,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int continue_balancing = 1;
u64 t0, domain_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;