author		Vincent Guittot <vincent.guittot@linaro.org>	2017-04-07 09:46:12 +0200
committer	Vincent Guittot <vincent.guittot@linaro.org>	2017-06-29 17:14:52 +0200
commit		1b5bafb797497428a4606d64b4863c04febe4a2a
tree		c91366f42d8354ae4b10d0af8aa045164c197ba3
parent		ff801b716effd652f420204eddb36f6e4a716819
sched: get rid of smt_gain
Align NUMA balancing with SMP load balancing when detecting whether a
group of CPUs is overloaded or has free capacity.

The goal is to remove the smt_gain field and the constraint it places on
the capacity of CPUs/cores in SMT systems, 1 < smt_power < 2, which
implies that a CPU's capacity is always below SCHED_CAPACITY_SCALE.

Having the max compute capacity of every CPU in the system set to
SCHED_CAPACITY_SCALE enables simplifications in the load balancer.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
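
To illustrate the constraint above: in the old code each SMT sibling's
capacity was derived from smt_gain, so it could never reach
SCHED_CAPACITY_SCALE. A minimal userspace sketch (not kernel code;
smt_thread_capacity is a hypothetical helper mirroring the SMT branch
removed from arch_scale_cpu_capacity() below):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/*
 * Mirror of the removed SMT branch of arch_scale_cpu_capacity():
 * each of span_weight hardware threads gets an equal share of the
 * core-level smt_gain budget.
 */
static unsigned long smt_thread_capacity(unsigned long smt_gain,
					 unsigned long span_weight)
{
	return smt_gain / span_weight;
}

int main(void)
{
	/* smt_gain = 1178 (~15% gain over one thread), 2 threads/core */
	unsigned long cap = smt_thread_capacity(1178, 2);

	/* prints 589, i.e. always below SCHED_CAPACITY_SCALE (1024) */
	printf("thread capacity = %lu of %lu\n", cap, SCHED_CAPACITY_SCALE);
	return 0;
}

With smt_gain = 1178 and two hardware threads per core (the values set in
sd_init() below), each thread gets 589 out of 1024; this asymmetry is
exactly what the patch removes.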
-rw-r--r--	include/linux/sched/topology.h	|  1 -
-rw-r--r--	kernel/sched/fair.c		| 78 ++++++++++++-----
-rw-r--r--	kernel/sched/sched.h		|  3 ---
-rw-r--r--	kernel/sched/topology.c		|  2 --
4 files changed, 61 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065abc7a47..9b173e89db85 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -88,7 +88,6 @@ struct sched_domain {
 	unsigned int newidle_idx;
 	unsigned int wake_idx;
 	unsigned int forkexec_idx;
-	unsigned int smt_gain;
 
 	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..0da9f631415a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1386,23 +1386,70 @@ static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
 	unsigned long nr_running;
-	unsigned long load;
+	unsigned long nr_cpus;
+	unsigned long load, util;
 
 	/* Total compute capacity of CPUs on a node */
 	unsigned long compute_capacity;
+	int imbalance_pct;
 
-	/* Approximate capacity in terms of runnable tasks on a node */
-	unsigned long task_capacity;
 	int has_free_capacity;
 };
 
 /*
+ * numa_has_free_capacity returns true if the node has spare capacity that
+ * could be used by some tasks.
+ * We consider that a node has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state and take into
+ * account the variance of the tasks' load, and return true only if the
+ * available capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but brings no
+ * benefit to the load balancer.
+ */
+static inline bool
+numa_has_free_capacity(struct numa_stats *ns)
+{
+	if (ns->nr_running < ns->nr_cpus)
+		return true;
+
+	if ((ns->compute_capacity * 100) >
+			(ns->util * ns->imbalance_pct))
+		return true;
+
+	return false;
+}
+
+static int cpu_util(int cpu);
+
+/*
+ * numa_is_overloaded returns true if the node has more tasks than it can
+ * handle.
+ * numa_is_overloaded is not equal to !numa_has_free_capacity because a node
+ * with exactly the right number of tasks has no spare capacity left but is
+ * not overloaded, so both numa_has_free_capacity and numa_is_overloaded
+ * return false.
+ */
+static inline bool
+numa_is_overloaded(struct numa_stats *ns)
+{
+	if (ns->nr_running <= ns->nr_cpus)
+		return false;
+
+	if ((ns->compute_capacity * 100) <
+			(ns->util * ns->imbalance_pct))
+		return true;
+
+	return false;
+}
+
+/*
  * XXX borrowed from update_sg_lb_stats
  */
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static void update_numa_stats(struct numa_stats *ns, int nid, int imbalance_pct)
 {
-	int smt, cpu, cpus = 0;
-	unsigned long capacity;
+	int cpu, cpus = 0;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1410,6 +1457,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 
 		ns->nr_running += rq->nr_running;
 		ns->load += weighted_cpuload(cpu);
+		ns->util += cpu_util(cpu);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -1426,13 +1474,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
-	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
-	capacity = cpus / smt; /* cores */
-
-	ns->task_capacity = min_t(unsigned, capacity,
-		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
-	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
+	ns->nr_cpus = cpus;
+	ns->imbalance_pct = imbalance_pct;
+	ns->has_free_capacity = numa_has_free_capacity(ns);
 }
 
 struct task_numa_env {
@@ -1583,7 +1627,7 @@ static void task_numa_compare(struct task_numa_env *env,
 
 	if (!cur) {
 		/* Is there capacity at our destination? */
-		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
+		if (!numa_is_overloaded(&env->src_stats) &&
 		    !env->dst_stats.has_free_capacity)
 			goto unlock;
 
@@ -1738,10 +1782,10 @@ static int task_numa_migrate(struct task_struct *p)
 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
 	taskweight = task_weight(p, env.src_nid, dist);
 	groupweight = group_weight(p, env.src_nid, dist);
-	update_numa_stats(&env.src_stats, env.src_nid);
+	update_numa_stats(&env.src_stats, env.src_nid, env.imbalance_pct);
 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
-	update_numa_stats(&env.dst_stats, env.dst_nid);
+	update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
 
 	/* Try to find a spot on the preferred nid. */
 	if (numa_has_capacity(&env))
@@ -1774,7 +1818,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		env.dist = dist;
 		env.dst_nid = nid;
-		update_numa_stats(&env.dst_stats, env.dst_nid);
+		update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
 		if (numa_has_capacity(&env))
 			task_numa_find_cpu(&env, taskimp, groupimp);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..ff66e9dc81a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1665,9 +1665,6 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 static __always_inline
 unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-		return sd->smt_gain / sd->span_weight;
-
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..30e751218018 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1132,7 +1132,6 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 
 		.child			= child,
@@ -1158,7 +1157,6 @@ sd_init(struct sched_domain_topology_level *tl,
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178;		/* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
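
For reference, a standalone userspace sketch (assumed values, not part of
the patch) of the two predicates added in fair.c above, showing the
imbalance_pct threshold logic and the state where a node is neither
overloaded nor has free capacity:

#include <stdbool.h>
#include <stdio.h>

/* userspace mock of the patched struct numa_stats fields used here */
struct numa_stats {
	unsigned long nr_running;
	unsigned long nr_cpus;
	unsigned long util;
	unsigned long compute_capacity;
	int imbalance_pct;
};

static bool numa_has_free_capacity(struct numa_stats *ns)
{
	if (ns->nr_running < ns->nr_cpus)
		return true;

	/* spare capacity must exceed the imbalance_pct threshold to count */
	return (ns->compute_capacity * 100) > (ns->util * ns->imbalance_pct);
}

static bool numa_is_overloaded(struct numa_stats *ns)
{
	if (ns->nr_running <= ns->nr_cpus)
		return false;

	return (ns->compute_capacity * 100) < (ns->util * ns->imbalance_pct);
}

int main(void)
{
	/* 4 CPUs, 4 tasks, utilization right at capacity, pct as for
	 * SD_SHARE_PKG_RESOURCES domains in sd_init() (117) */
	struct numa_stats ns = {
		.nr_running = 4, .nr_cpus = 4, .util = 4096,
		.compute_capacity = 4096, .imbalance_pct = 117,
	};

	/* prints free=0 overloaded=0: no spare capacity, not overloaded */
	printf("free=%d overloaded=%d\n",
	       numa_has_free_capacity(&ns), numa_is_overloaded(&ns));
	return 0;
}

Note the deliberate gap between the two predicates: with nr_running equal
to nr_cpus, both return false, so a node with exactly the right number of
tasks is treated as full, which keeps the balancer from bouncing tasks.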