author    Vincent Guittot <vincent.guittot@linaro.org>    2017-04-07 09:46:12 +0200
committer Vincent Guittot <vincent.guittot@linaro.org>    2017-06-29 17:14:52 +0200
commit    1b5bafb797497428a4606d64b4863c04febe4a2a (patch)
tree      c91366f42d8354ae4b10d0af8aa045164c197ba3
parent    ff801b716effd652f420204eddb36f6e4a716819 (diff)
sched: get rid of smt_gain (sched-remove-max-capacity)
Align NUMA balancing with SMP balancing for detecting whether a group of CPUs is overloaded or has free capacity.

The goal is to remove the smt_gain field and the constraint on the capacity of CPUs/cores in an SMT system, 1 < smt_power < 2, which implies that a CPU's capacity is below SCHED_CAPACITY_SCALE.

Having the max compute capacity of every CPU in the system always set to SCHED_CAPACITY_SCALE enables simplifications in the load balancer.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
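For illustration, a minimal user-space sketch (not part of the patch) of what the change means for per-CPU capacity: under the old scheme an SMT sibling reported smt_gain / span_weight (1178 / 2 = 589 for a two-thread core), below SCHED_CAPACITY_SCALE; after the patch every CPU reports SCHED_CAPACITY_SCALE. The two-thread core is an assumed example topology.

/*
 * Standalone sketch, not kernel code: contrasts the old smt_gain-based
 * capacity with the flat capacity this patch switches to.  The values
 * SCHED_CAPACITY_SCALE = 1024 and smt_gain = 1178 come from the patch;
 * the two-sibling SMT core is an assumed example.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* Old scheme: SMT siblings split the ~15% core gain (1178 / 2 = 589). */
static unsigned long old_cpu_capacity(unsigned int smt_gain,
				      unsigned int span_weight)
{
	if (span_weight > 1)
		return smt_gain / span_weight;
	return SCHED_CAPACITY_SCALE;
}

/* New scheme: every CPU reports the full scale, SMT or not. */
static unsigned long new_cpu_capacity(void)
{
	return SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* Hypothetical core with two hardware threads. */
	printf("old SMT sibling capacity: %lu\n", old_cpu_capacity(1178, 2));
	printf("new SMT sibling capacity: %lu\n", new_cpu_capacity());
	return 0;
}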
-rw-r--r--  include/linux/sched/topology.h |  1
-rw-r--r--  kernel/sched/fair.c            | 78
-rw-r--r--  kernel/sched/sched.h           |  3
-rw-r--r--  kernel/sched/topology.c        |  2
4 files changed, 61 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065abc7a47..9b173e89db85 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -88,7 +88,6 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
- unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..0da9f631415a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1386,23 +1386,70 @@ static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long nr_running;
- unsigned long load;
+ unsigned long nr_cpus;
+ unsigned long load, util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ int imbalance_pct;
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long task_capacity;
int has_free_capacity;
};
/*
+ * numa_has_free_capacity returns true if the node has spare capacity that
+ * could be used by some tasks.
+ * We consider that a node has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't bring
+ * any benefit to the load balancer.
+ */
+static inline bool
+numa_has_free_capacity(struct numa_stats *ns)
+{
+ if (ns->nr_running < ns->nr_cpus)
+ return true;
+
+ if ((ns->compute_capacity * 100) >
+ (ns->util * ns->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+static int cpu_util(int cpu);
+
+/*
+ * numa_is_overloaded returns true if the node has more tasks than it can
+ * handle.
+ * numa_is_overloaded is not equal to !numa_has_free_capacity because a node
+ * with exactly the right number of tasks has no spare capacity left but is
+ * not overloaded, so both numa_has_free_capacity and numa_is_overloaded
+ * return false.
+ */
+static inline bool
+numa_is_overloaded(struct numa_stats *ns)
+{
+ if (ns->nr_running <= ns->nr_cpus)
+ return false;
+
+ if ((ns->compute_capacity * 100) <
+ (ns->util * ns->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
* XXX borrowed from update_sg_lb_stats
*/
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static void update_numa_stats(struct numa_stats *ns, int nid, int imbalance_pct)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu, cpus = 0;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1410,6 +1457,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
+ ns->util += cpu_util(cpu);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1426,13 +1474,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
if (!cpus)
return;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- ns->task_capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
- ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
+ ns->nr_cpus = cpus;
+ ns->imbalance_pct = imbalance_pct;
+ ns->has_free_capacity = numa_has_free_capacity(ns);
}
struct task_numa_env {
@@ -1583,7 +1627,7 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
+ if (!numa_is_overloaded(&env->src_stats) &&
!env->dst_stats.has_free_capacity)
goto unlock;
@@ -1738,10 +1782,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid, env.imbalance_pct);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
/* Try to find a spot on the preferred nid. */
if (numa_has_capacity(&env))
@@ -1774,7 +1818,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
if (numa_has_capacity(&env))
task_numa_find_cpu(&env, taskimp, groupimp);
}
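For illustration, a minimal user-space sketch (not part of the patch) that mirrors the numa_has_free_capacity()/numa_is_overloaded() checks added above with concrete numbers. The 4-CPU node, the utilization value and the imbalance_pct of 125 are assumptions chosen for the example; the patch itself takes imbalance_pct from the scheduling domain.

/*
 * User-space sketch of the free-capacity / overload checks with concrete
 * numbers.  The 4-CPU node, the utilization figure and imbalance_pct = 125
 * are assumptions for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct numa_stats {
	unsigned long nr_running;
	unsigned long nr_cpus;
	unsigned long util;
	unsigned long compute_capacity;
	int imbalance_pct;
};

static bool numa_has_free_capacity(const struct numa_stats *ns)
{
	if (ns->nr_running < ns->nr_cpus)
		return true;
	/* util must stay below capacity * 100 / imbalance_pct (80% here). */
	return ns->compute_capacity * 100 > ns->util * ns->imbalance_pct;
}

static bool numa_is_overloaded(const struct numa_stats *ns)
{
	if (ns->nr_running <= ns->nr_cpus)
		return false;
	return ns->compute_capacity * 100 < ns->util * ns->imbalance_pct;
}

int main(void)
{
	/* 4 CPUs at full scale: compute_capacity = 4 * 1024 = 4096. */
	struct numa_stats ns = {
		.nr_running = 6,
		.nr_cpus = 4,
		.util = 3500,		/* ~85% of 4096 */
		.compute_capacity = 4096,
		.imbalance_pct = 125,
	};

	/* 4096 * 100 = 409600 < 3500 * 125 = 437500: node is overloaded. */
	printf("free capacity: %d, overloaded: %d\n",
	       numa_has_free_capacity(&ns), numa_is_overloaded(&ns));
	return 0;
}

With these numbers, util is about 85% of the node's capacity, above the 100/125 = 80% threshold, so the node reports no free capacity, and since it also runs more tasks than it has CPUs, it counts as overloaded.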
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..ff66e9dc81a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1665,9 +1665,6 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
return SCHED_CAPACITY_SCALE;
}
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..30e751218018 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1132,7 +1132,6 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
- .smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
.child = child,
@@ -1158,7 +1157,6 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;