summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c983
1 files changed, 904 insertions, 79 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 812069b66f47..ae6207c63da5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -34,6 +34,7 @@
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -2600,6 +2601,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
@@ -2760,6 +2762,10 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se))
+ trace_sched_load_avg_task(task_of(se), &se->avg);
+ trace_sched_load_avg_cpu(cpu, cfs_rq);
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2840,27 +2846,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
#else
- last_update_time = cfs_rq->avg.last_update_time;
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.last_update_time;
+}
#endif
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
@@ -4177,6 +4201,23 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+static inline unsigned long boosted_cpu_util(int cpu);
+
+static void update_capacity_of(int cpu)
+{
+ unsigned long req_cap;
+
+ if (!sched_freq())
+ return;
+
+ /* Convert scale-invariant capacity to cpu. */
+ req_cap = boosted_cpu_util(cpu);
+ req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+ set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+
+static bool cpu_overutilized(int cpu);
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4187,6 +4228,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+ int task_wakeup = flags & ENQUEUE_WAKEUP;
for_each_sched_entity(se) {
if (se->on_rq)
@@ -4218,9 +4261,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu))
+ rq->rd->overutilized = true;
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are waking up; this is
+ * because we get here also during load balancing, but
+ * in these cases it seems wise to trigger as single
+ * request after load balancing is done.
+ */
+ if (task_new || task_wakeup)
+ update_capacity_of(cpu_of(rq));
+ }
hrtick_update(rq);
}
@@ -4277,9 +4335,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
+ schedtune_dequeue_task(p, cpu_of(rq));
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are going to sleep;
+ * this is because we get here also during load
+ * balancing, but in these cases it seems wise to
+ * trigger as single request after load balancing is
+ * done.
+ */
+ if (task_sleep) {
+ if (rq->cfs.nr_running)
+ update_capacity_of(cpu_of(rq));
+ else if (sched_freq())
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ }
+ }
hrtick_update(rq);
}
@@ -4506,15 +4580,6 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
@@ -4689,6 +4754,352 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig *
+ arch_scale_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+
+struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ int payoff;
+ struct task_struct *task;
+ struct {
+ int before;
+ int after;
+ int delta;
+ int diff;
+ } nrg;
+ struct {
+ int before;
+ int after;
+ int delta;
+ } cap;
+};
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+ int util = __cpu_util(cpu, delta);
+
+ if (util >= capacity)
+ return SCHED_CAPACITY_SCALE;
+
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+ if (cpu == eenv->src_cpu)
+ return -eenv->util_delta;
+ if (cpu == eenv->dst_cpu)
+ return eenv->util_delta;
+ return 0;
+}
+
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+ int i, delta;
+ unsigned long max_util = 0;
+
+ for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+ delta = calc_util_delta(eenv, i);
+ max_util = max(max_util, __cpu_util(i, delta));
+ }
+
+ return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to it's
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+ int i, delta;
+ unsigned long util_sum = 0;
+ unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ delta = calc_util_delta(eenv, i);
+ util_sum += __cpu_norm_util(i, capacity, delta);
+ }
+
+ if (util_sum > SCHED_CAPACITY_SCALE)
+ return SCHED_CAPACITY_SCALE;
+ return util_sum;
+}
+
+static int find_new_capacity(struct energy_env *eenv,
+ const struct sched_group_energy const *sge)
+{
+ int idx;
+ unsigned long util = group_max_util(eenv);
+
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
+ if (sge->cap_states[idx].cap >= util)
+ break;
+ }
+
+ eenv->cap_idx = idx;
+
+ return idx;
+}
+
+static int group_idle_state(struct sched_group *sg)
+{
+ int i, state = INT_MAX;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ return state;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working it's way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ int cpu, total_energy = 0;
+ struct cpumask visit_cpus;
+ struct sched_group *sg;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+
+ cpu = cpumask_first(&visit_cpus);
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (!sd)
+ /*
+ * We most probably raced with hotplug; returning a
+ * wrong energy estimation is better than entering an
+ * infinite loop.
+ */
+ return -EINVAL;
+
+ if (sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy;
+ int cap_idx, idle_idx;
+
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->util_delta == 0 &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->util_delta != 0 &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
+
+ idle_idx = group_idle_state(sg);
+ group_util = group_norm_util(eenv, sg);
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+ sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+ * sg->sge->idle_states[idle_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ if (!sd->child)
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+next_cpu:
+ continue;
+ }
+
+ eenv->energy = total_energy;
+ return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+#ifdef CONFIG_SCHED_TUNE
+static int energy_diff_evaluate(struct energy_env *eenv)
+{
+ unsigned int boost;
+ int nrg_delta;
+
+ /* Return energy diff when boost margin is 0 */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boost = schedtune_task_boost(eenv->task);
+#else
+ boost = get_sysctl_sched_cfs_boost();
+#endif
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = schedtune_normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function return a negative value each time the evaluation return a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff_evaluate(eenv) eenv->nrg.diff
+#endif
+
+/*
+ * energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilisation amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ */
+static int energy_diff(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1, energy_before = 0, energy_after = 0;
+
+ struct energy_env eenv_before = {
+ .util_delta = 0,
+ .src_cpu = eenv->src_cpu,
+ .dst_cpu = eenv->dst_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
+ };
+
+ if (eenv->src_cpu == eenv->dst_cpu)
+ return 0;
+
+ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+ if (!sd)
+ return 0; /* Error */
+
+ sg = sd->groups;
+
+ do {
+ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+ eenv_before.sg_top = eenv->sg_top = sg;
+
+ if (sched_group_energy(&eenv_before))
+ return 0; /* Invalid result abort */
+ energy_before += eenv_before.energy;
+
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
+ if (sched_group_energy(eenv))
+ return 0; /* Invalid result abort */
+ energy_after += eenv->energy;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+
+ return energy_diff_evaluate(eenv);
+}
+
+/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
* at a frequency roughly N times higher than one of its wakees. In order
@@ -4779,6 +5190,157 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
return 1;
}
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ if (capacity == max_capacity)
+ return true;
+
+ if (capacity * capacity_margin > max_capacity * 1024)
+ return true;
+
+ return __task_fits(p, cpu, 0);
+}
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+ return __task_fits(p, cpu, cpu_util(cpu));
+}
+
+static bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+static unsigned long
+schedtune_margin(unsigned long signal, unsigned long boost)
+{
+ unsigned long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_LOAD_SCALE - S)
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+
+ /*
+ * Fast integer division by constant:
+ * Constant : (C) = 100
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 100000
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 17
+ * Mult const : round(2^S/C) (M) = 1311
+ *
+ *
+ */
+ margin *= 1311;
+ margin >>= 17;
+
+ return margin;
+}
+
+static inline unsigned int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ unsigned int boost;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boost = schedtune_cpu_boost(cpu);
+#else
+ boost = get_sysctl_sched_cfs_boost();
+#endif
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline unsigned long
+schedtune_task_margin(struct task_struct *task)
+{
+ unsigned int boost;
+ unsigned long util;
+ unsigned long margin;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boost = schedtune_task_boost(task);
+#else
+ boost = get_sysctl_sched_cfs_boost();
+#endif
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline unsigned int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline unsigned int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util(cpu);
+ unsigned long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ unsigned long margin = schedtune_task_margin(task);
+
+ return util + margin;
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -4788,7 +5350,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *fit_group = NULL, *spare_group = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long fit_capacity = ULONG_MAX;
+ unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -4796,7 +5361,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_capacity;
int local_group;
int i;
@@ -4819,6 +5384,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load = target_load(i, load_idx);
avg_load += load;
+
+ /*
+ * Look for most energy-efficient group that can fit
+ * that can fit the task.
+ */
+ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+ fit_capacity = capacity_of(i);
+ fit_group = group;
+ }
+
+ /*
+ * Look for group which has most spare capacity on a
+ * single cpu.
+ */
+ spare_capacity = capacity_of(i) - cpu_util(i);
+ if (spare_capacity > max_spare_capacity) {
+ max_spare_capacity = spare_capacity;
+ spare_group = group;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -4832,6 +5416,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
} while (group = group->next, group != sd->groups);
+ if (fit_group)
+ return fit_group;
+
+ if (spare_group)
+ return spare_group;
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
@@ -4852,7 +5442,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
+ if (task_fits_spare(p, i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -4864,7 +5454,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ } else if (idle_cpu(i) &&
+ (!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -4873,6 +5464,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+ } else if (shallowest_idle_cpu == -1) {
+ /*
+ * If we haven't found an idle CPU yet
+ * pick a non-idle one that can fit the task as
+ * fallback.
+ */
+ shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -4931,38 +5529,85 @@ done:
return target;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
+static int energy_aware_wake_cpu(struct task_struct *p, int target)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ struct sched_domain *sd;
+ struct sched_group *sg, *sg_target;
+ int target_max_cap = INT_MAX;
+ int target_cpu = task_cpu(p);
+ int i;
+
+ sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+ if (!sd)
+ return target;
+
+ sg = sd->groups;
+ sg_target = sg;
+
+ /*
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
+ */
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
+
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
- return (util >= capacity) ? capacity : util;
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ int new_util = cpu_util(i) + boosted_task_util(p);
+
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
+
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+
+ if (target_cpu != task_cpu(p)) {
+ struct energy_env eenv = {
+ .util_delta = task_util(p),
+ .src_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
+ .task = p,
+ };
+
+ /* Not enough spare capacity on previous cpu */
+ if (cpu_overutilized(task_cpu(p)))
+ return target_cpu;
+
+ if (energy_diff(&eenv) >= 0)
+ return task_cpu(p);
+ }
+
+ return target_cpu;
}
/*
@@ -4987,7 +5632,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
+ energy_aware();
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -5017,7 +5664,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
if (!sd) {
- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+ else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu);
} else while (sd) {
@@ -5333,6 +5982,8 @@ again:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
@@ -5354,9 +6005,12 @@ simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
@@ -5569,6 +6223,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -5587,6 +6248,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
@@ -5597,6 +6259,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
};
@@ -5923,6 +6586,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
+ /*
+ * We want to potentially raise target_cpu's OPP.
+ */
+ update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -5944,6 +6611,11 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
+ /*
+ * We want to potentially raise env.dst_cpu's OPP.
+ */
+ update_capacity_of(env->dst_cpu);
+
raw_spin_unlock(&env->dst_rq->lock);
}
@@ -6039,12 +6711,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -6060,6 +6726,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -6151,19 +6818,57 @@ static unsigned long scale_rt_capacity(int cpu)
used = div_u64(avg, total);
+ /*
+ * deadline bandwidth is defined at system level so we must
+ * weight this bandwidth with the max capacity of the system.
+ * As a reminder, avg_bw is 20bits width and
+ * scale_cpu_capacity is 10 bits width
+ */
+ used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
return 1;
}
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6172,13 +6877,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6191,6 +6897,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -6215,11 +6922,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -6229,12 +6937,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -6329,6 +7041,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -6339,6 +7063,9 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
@@ -6350,11 +7077,12 @@ group_type group_classify(struct sched_group *group,
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
int i;
@@ -6384,6 +7112,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
+
+ if (cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -6425,9 +7159,25 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
if (sgs->avg_load <= busiest->avg_load)
return false;
+ /*
+ * Candiate sg has no more than one task per cpu and has higher
+ * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -6489,7 +7239,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -6511,7 +7261,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
@@ -6533,6 +7283,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs->group_type = group_classify(sg, sgs);
}
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
+
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
@@ -6549,12 +7308,20 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
- }
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (env->dst_rq->rd->overutilized != overutilized)
+ env->dst_rq->rd->overutilized = overutilized;
+ } else {
+ if (!env->dst_rq->rd->overutilized && overutilized)
+ env->dst_rq->rd->overutilized = true;
+ }
}
/**
@@ -6701,6 +7468,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -6734,6 +7517,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -6775,6 +7563,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -6803,6 +7595,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task) {
+ goto force_balance;
+ }
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -6826,7 +7623,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
@@ -6839,6 +7637,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
force_balance:
+ env->busiest_group_type = busiest->group_type;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return sds.busiest;
@@ -6897,7 +7696,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
@@ -6958,6 +7758,13 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -7079,6 +7886,11 @@ more_balance:
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ if (cur_ld_moved)
+ update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -7170,7 +7982,8 @@ more_balance:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -7311,8 +8124,9 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -7447,8 +8261,13 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ update_capacity_of(env.src_cpu);
+ }
else
schedstat_inc(sd, alb_failed);
}
@@ -7828,12 +8647,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (time_before(now, nohz.next_balance))
return false;
- if (rq->nr_running >= 2)
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
+ if (sd && !energy_aware()) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
@@ -7939,6 +8759,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+ rq->rd->overutilized = true;
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
}
/*