diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 2483 |
1 files changed, 2241 insertions, 242 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 812069b66f47..c193e9b1c38f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,10 +30,13 @@ #include <linux/mempolicy.h> #include <linux/migrate.h> #include <linux/task_work.h> +#include <linux/module.h> #include <trace/events/sched.h> #include "sched.h" +#include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -50,6 +53,16 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; +unsigned int sysctl_sched_cstate_aware = 1; + +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -114,6 +127,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -286,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. 
The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. 
+ */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to point to the new beginning + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -656,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -680,18 +739,115 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = sched_freq() ? + * sysctl_sched_initial_task_util : + * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. + * It is unclear if we should restore this in enqueue. 
+ */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static void attach_entity_cfs_rq(struct sched_entity *se); + +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. 
+ */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); + return; + } + } + + attach_entity_cfs_rq(se); +} + #else void init_entity_runnable_average(struct sched_entity *se) { } -#endif +void post_init_entity_util_avg(struct sched_entity *se) +{ +} +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1388,7 +1544,8 @@ balance: * Call select_idle_sibling to maybe find a better one. */ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -2373,28 +2530,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. 
*/ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} + load = scale_load_down(cfs_rq->load.weight); -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2413,6 +2564,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -2431,16 +2583,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) return; + + if (throttled_hierarchy(cfs_rq)) + return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2449,8 +2605,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2600,6 +2757,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + 
trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; @@ -2675,25 +2833,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } -#ifdef CONFIG_FAIR_GROUP_SCHED /* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_FAIR_GROUP_SCHED +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. 
+ */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + 
if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). 
+ */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases 
this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -2713,23 +3108,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. 
+ */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2740,65 +3155,93 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); + int decayed; + void *ptr = NULL; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), 
cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. 
+ */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2806,62 +3249,76 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). 
- */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); + + return last_update_time; +} #else - last_update_time = cfs_rq->avg.last_update_time; +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). 
+ */ +void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -2898,7 +3355,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -2962,6 +3428,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -3040,9 +3507,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -3115,6 +3583,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. 
+ * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -3150,7 +3628,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } /* @@ -3205,7 +3683,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3321,8 +3799,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); - update_cfs_shares(cfs_rq); + update_load_avg(curr, UPDATE_TG); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4177,6 +4655,28 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); +unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif + +#ifdef CONFIG_SMP +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and @@ -4187,6 +4687,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif + + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { if (se->on_rq) @@ -4203,6 +4715,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4210,17 +4723,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) add_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. 
+ */ + schedtune_enqueue_task(p, cpu_of(rq)); + + if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger a single + * request after load balancing is done. + */ + if (task_new || task_wakeup) + update_capacity_of(cpu_of(rq)); + } + +#endif /* CONFIG_SMP */ hrtick_update(rq); } @@ -4250,6 +4805,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4269,17 +4825,50 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) sub_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + + if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger a single request after load balancing is + * done. 
+ */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4506,15 +5095,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4689,6 +5269,467 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif /* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + int payoff; + struct task_struct *task; + struct { + int before; + int after; + int delta; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; +}; + +/* + * __cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for + * energy calculations. Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. 
+ * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +{ + int util = __cpu_util(cpu, delta); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static int calc_util_delta(struct energy_env *eenv, int cpu) +{ + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to its + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy).
+ */ +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) +{ + int i, delta; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct energy_env *eenv, + const struct sched_group_energy * const sge) +{ + int idx; + unsigned long util = group_max_util(eenv); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + break; + } + + eenv->cap_idx = idx; + + return idx; +} + +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) +{ + int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; + + /* Find the shallowest idle state in the sched group. */ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) + grp_util += cpu_util(i); + + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + goto end; + } + /* add or remove util as appropriate to indicate what group util + * will be (worst case - no concurrent execution) after moving the task + */ + grp_util += src_in_grp ? 
-eenv->util_delta : eenv->util_delta; + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all. + */ + state = 0; + } +end: + return state; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. 
+ */ +static int sched_group_energy(struct energy_env *eenv) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!eenv->sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (sd && sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? */ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + unsigned long group_util; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + eenv->sg_cap = sg_shared_cap; + else + eenv->sg_cap = sg; + + cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + + idle_idx = group_idle_state(eenv, sg); + group_util = group_norm_util(eenv, sg); + + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if 
(cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } + + /* + * If we raced with hotplug and got an sd NULL-pointer; + * returning a wrong energy estimation is better than + * entering an infinite loop. + */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; +next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); + continue; + } + + eenv->energy = total_energy; + return 0; +} + +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static inline int __energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0, 0}, + .cap = { 0, 0, 0 }, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + + if (!sd) + return 0; /* Error */ + + sg = sd->groups; + + do { + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } + } while (sg = sg->next, sg != sd->groups); + + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; +#ifndef CONFIG_SCHED_TUNE + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); +#endif + /* + * Dead-zone margin preventing too many migrations. + */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; + + return eenv->nrg.diff; +} + +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; +extern bool schedtune_initialized; +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? 
-1 : 1 ; + +#ifdef CONFIG_SCHED_DEBUG + { + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); + } +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_CAPACITY_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + int boost = schedtune_task_boost(eenv->task); + int nrg_delta; + + /* Compute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function returns a negative value each time the evaluation returns a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + +/* * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened * at a frequency roughly N times higher than one of its wakees. In order @@ -4713,18 +5754,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -4779,6 +5820,149 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif + return p->se.avg.util_avg; +} + +static inline unsigned long boosted_task_util(struct task_struct *task); + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += boosted_task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +#ifdef CONFIG_SCHED_TUNE + +struct reciprocal_value schedtune_spc_rdiv; + +static long +schedtune_margin(unsigned 
long signal, long boost) +{ + long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_CAPACITY_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + if (boost >= 0) { + margin = SCHED_CAPACITY_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); + + if (boost < 0) + margin *= -1; + return margin; +} + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + int boost = schedtune_cpu_boost(cpu); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +static inline long +schedtune_task_margin(struct task_struct *task) +{ + int boost = schedtune_task_boost(task); + unsigned long util; + long margin; + + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + return 0; +} + +static inline int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + +#endif /* CONFIG_SCHED_TUNE */ + +unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + long margin = schedtune_cpu_margin(util, cpu); + + trace_sched_boost_cpu(cpu, util, margin); + + return util + margin; +} + +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + long margin = schedtune_task_margin(task); + + trace_sched_boost_task(task, util, margin); + + return util + margin; +} + +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU 
group within the * domain. @@ -4788,7 +5972,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4796,7 +5982,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -4808,8 +5994,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -4819,6 +6009,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -4826,12 +6021,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high 
utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4850,6 +6066,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { @@ -4889,20 +6109,33 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); - - if (idle_cpu(target)) - return target; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; + + schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq(), eas_stats.sis_attempts); + + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq(), eas_stats.sis_idle); + return target; + } - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. 
+ */ + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq(), eas_stats.sis_cache_affine); + return prev; + } + } /* * Otherwise, iterate the domains and find an elegible idle cpu. @@ -4911,60 +6144,479 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; + + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq(), eas_stats.sis_suff_cap); + schedstat_inc(sd, eas_stats.sis_suff_cap); + return target; + } + + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } - target = cpumask_first_and(sched_group_cpus(sg), + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq(), eas_stats.sis_idle_cpu); + schedstat_inc(sd, eas_stats.sis_idle_cpu); + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + + if (best_idle_cpu >= 0) + target = best_idle_cpu; + done: + schedstat_inc(p, se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq(), eas_stats.sis_count); + return target; 
} /* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. 
+ */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); return (util >= capacity) ? capacity : util; } +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + + RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} + +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) +{ + unsigned long best_idle_min_cap_orig = ULONG_MAX; + unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; + struct sched_domain *sd; + struct sched_group *sg; + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; + + *backup_cpu = -1; + + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq(), eas_stats.fbt_attempts); + + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); + if (cpu < 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); + return -1; + } + + /* Find SD for the start CPU */ + sd = rcu_dereference(per_cpu(sd_ea, cpu)); + if (!sd) { + schedstat_inc(p, 
se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq(), eas_stats.fbt_no_sd); + return -1; + } + + /* Scan CPUs in all SDs */ + sg = sd->groups; + do { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; + + if (!cpu_online(i)) + continue; + + if (walt_cpu_high_irqload(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig) + continue; + + /* + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to + * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals tries to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. 
It exits the loop with either a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. + */ + if (prefer_idle) { + + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + + return i; + } + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; + } + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid waking up deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one such CPU is available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performance by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance.
+ */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at a higher OPP. + * + * Thus, this case keeps track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - new_util; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; + } + + } while (sg = sg->next, sg != sd->groups); + + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we were not able to find a target + * ACTIVE CPU. + * + * Policy priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e.
target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); + + return target_cpu; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) +{ + struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target, tmp_backup; + bool boosted, prefer_idle; + + schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq(), eas_stats.secb_attempts); + + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq(), eas_stats.secb_sync); + return cpu; + } + } + + rcu_read_lock(); +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = 
schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; +#else + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; +#endif + + sync_entity_load_avg(&p->se); + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); + + if (!sd) + goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + goto unlock; + } + } + + if (target_cpu != prev_cpu) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(prev_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); + goto unlock; + } + + if (energy_diff(&eenv) >= 0) { + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } + } + + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; + } + + schedstat_inc(p, se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq(), eas_stats.secb_count); + +unlock: + rcu_read_unlock(); + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -4987,7 +6639,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5012,47 +6668,65 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; + } else { + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); } - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } + while (sd) { + struct sched_group *group; + int weight; - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { 
+ /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ } - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); } - /* while loop will break here if sd == NULL */ } rcu_read_unlock(); @@ -5087,6 +6761,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -5333,6 +7009,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5354,9 +7032,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5569,6 +7250,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -5587,6 +7275,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for 
load-balancing */ struct cpumask *cpus; @@ -5597,6 +7286,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -5778,7 +7468,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* @@ -5923,6 +7615,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -5944,6 +7640,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -5966,8 +7667,13 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6027,7 +7733,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6039,12 +7745,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats 
- stats of a sched_group required for load_balancing */ @@ -6060,6 +7760,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6151,19 +7852,58 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * deadline bandwidth is defined at system level so we must + * weight this bandwidth with the max capacity of the system. + * As a reminder, avg_bw is 20bits width and + * scale_cpu_capacity is 10 bits width + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used; return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6172,13 +7912,15 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) 
cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6191,6 +7933,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6215,11 +7959,13 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6229,12 +7975,18 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -6329,6 +8081,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. 
+ */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6339,9 +8103,44 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. + */ +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + +static inline void update_cpu_stats_if_tickless(struct rq *rq) +{ + /* only called from update_sg_lb_stats when irqs are disabled */ + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) { + /* rate limit updates to once-per-jiffie at most */ + if (READ_ONCE(jiffies) <= rq->last_load_update_tick) + return; + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false); + raw_spin_unlock(&rq->lock); + } +} + +#else +static inline void update_cpu_stats_if_tickless(struct rq *rq) { } +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -6350,20 +8149,27 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. 
*/ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + /* if we are entering idle and there are CPUs with + * their tick stopped, do an update for them + */ + if (env->idle == CPU_NEWLY_IDLE) + update_cpu_stats_if_tickless(rq); + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -6374,7 +8180,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6382,8 +8189,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) { + *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -6425,9 +8241,31 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. 
+ */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6478,6 +8316,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -6489,7 +8330,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6511,7 +8352,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6533,6 +8374,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. 
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -6549,10 +8399,23 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); - if (!env->sd->parent) { + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; + + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) { + env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } + } else { + if (!env->dst_rq->rd->overutilized && overutilized) { + env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } } @@ -6701,6 +8564,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -6734,6 +8613,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. 
*/ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -6775,6 +8659,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -6803,6 +8691,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -6826,7 +8719,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -6839,6 +8733,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. 
Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -6897,7 +8792,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* @@ -6958,6 +8854,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -7006,7 +8909,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -7079,6 +8982,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -7170,7 +9078,8 @@ more_balance: * excessive cache_hot migrations and active balances. 
*/ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -7302,6 +9211,7 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + long removed_util=0; idle_enter_fair(this_rq); @@ -7311,8 +9221,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -7324,6 +9235,17 @@ static int idle_balance(struct rq *this_rq) raw_spin_unlock(&this_rq->lock); + /* + * If removed_util_avg is !0 we most probably migrated some task away + * from this_cpu. In this case we might be willing to trigger an OPP + * update, but we want to do so if we don't find anybody else to pull + * here (we will trigger an OPP update with the pulled task's enqueue + * anyway). + * + * Record removed_util before calling update_blocked_averages, and use + * it below (before returning) to see if an OPP update is required. + */ + removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -7388,6 +9310,12 @@ out: if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } else if (removed_util) { + /* + * No task pulled and someone has been migrated away. + * Good case to trigger an OPP update. + */ + update_capacity_of(this_cpu); } return pulled_task; @@ -7447,8 +9375,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. 
+ */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } @@ -7477,12 +9410,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; - static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -7828,12 +9755,17 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) + return true; + + /* Do idle load balance if there is a misfit task */ + if (energy_aware() && rq->misfit_task) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); @@ -7939,6 +9871,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + +#ifdef CONFIG_SMP + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + + rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* @@ -8040,6 +9982,61 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void
propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(se, 0); + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -8054,8 +10051,7 @@ static void detach_task_cfs_rq(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } - /* Catch up with the cfs_rq and remove our load when we leave */ - detach_entity_load_avg(cfs_rq, se); + detach_entity_cfs_rq(se); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8063,16 +10059,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? 
se->parent->depth + 1 : 0; -#endif - - /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8126,6 +10113,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -8166,8 +10156,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8182,6 +10173,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8195,6 +10188,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8283,8 +10280,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } |