author    Vincent Guittot <vincent.guittot@linaro.org>    2017-04-07 09:46:12 +0200
committer Vincent Guittot <vincent.guittot@linaro.org>    2017-06-29 17:14:52 +0200
commit    1b5bafb797497428a4606d64b4863c04febe4a2a (patch)
tree      c91366f42d8354ae4b10d0af8aa045164c197ba3
parent    ff801b716effd652f420204eddb36f6e4a716819 (diff)
sched: get rid of smt_gain (sched-remove-max-capacity)
Align NUMA balancing with SMP balancing for detecting whether a group of CPUs is overloaded or has free capacity.

The goal is to remove the smt_gain field and the constraint on the capacity of CPUs/cores in an SMT system, 1 < smt_power < 2, which implies that a CPU's capacity is below SCHED_CAPACITY_SCALE.

Having the max compute capacity of every CPU in the system always set to SCHED_CAPACITY_SCALE enables simplifications in the load balancer.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
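For illustration, a minimal user-space sketch (not part of the patch) of what the change means for per-CPU capacity: under the old scheme an SMT sibling reported smt_gain / span_weight (1178 / 2 = 589 for a two-thread core), below SCHED_CAPACITY_SCALE; after the patch every CPU reports SCHED_CAPACITY_SCALE. The two-thread core is an assumed example topology.

/*
 * Standalone sketch, not kernel code: contrasts the old smt_gain-based
 * capacity with the flat capacity this patch switches to.  The values
 * SCHED_CAPACITY_SCALE = 1024 and smt_gain = 1178 come from the patch;
 * the two-sibling SMT core is an assumed example.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* Old scheme: SMT siblings split the ~15% core gain (1178 / 2 = 589). */
static unsigned long old_cpu_capacity(unsigned int smt_gain,
				      unsigned int span_weight)
{
	if (span_weight > 1)
		return smt_gain / span_weight;
	return SCHED_CAPACITY_SCALE;
}

/* New scheme: every CPU reports the full scale, SMT or not. */
static unsigned long new_cpu_capacity(void)
{
	return SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* Hypothetical core with two hardware threads. */
	printf("old SMT sibling capacity: %lu\n", old_cpu_capacity(1178, 2));
	printf("new SMT sibling capacity: %lu\n", new_cpu_capacity());
	return 0;
}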
-rw-r--r--  include/linux/sched/topology.h |  1
-rw-r--r--  kernel/sched/fair.c            | 78
-rw-r--r--  kernel/sched/sched.h           |  3
-rw-r--r--  kernel/sched/topology.c        |  2
4 files changed, 61 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065abc7a47..9b173e89db85 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -88,7 +88,6 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
- unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..0da9f631415a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1386,23 +1386,70 @@ static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long nr_running;
- unsigned long load;
+ unsigned long nr_cpus;
+ unsigned long load, util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ int imbalance_pct;
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long task_capacity;
int has_free_capacity;
};
/*
+ * numa_has_free_capacity returns true if the node has spare capacity that
+ * could be used by some tasks.
+ * We consider that a node has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't bring
+ * any benefit to the load balancer.
+ */
+static inline bool
+numa_has_free_capacity(struct numa_stats *ns)
+{
+ if (ns->nr_running < ns->nr_cpus)
+ return true;
+
+ if ((ns->compute_capacity * 100) >
+ (ns->util * ns->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+static int cpu_util(int cpu);
+
+/*
+ * numa_is_overloaded returns true if the node has more tasks than it can
+ * handle.
+ * numa_is_overloaded is not equal to !numa_has_free_capacity because a node
+ * with exactly the right number of tasks has no spare capacity left but is
+ * not overloaded, so both numa_has_free_capacity and numa_is_overloaded
+ * return false.
+ */
+static inline bool
+numa_is_overloaded(struct numa_stats *ns)
+{
+ if (ns->nr_running <= ns->nr_cpus)
+ return false;
+
+ if ((ns->compute_capacity * 100) <
+ (ns->util * ns->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
* XXX borrowed from update_sg_lb_stats
*/
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static void update_numa_stats(struct numa_stats *ns, int nid, int imbalance_pct)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu, cpus = 0;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1410,6 +1457,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
+ ns->util += cpu_util(cpu);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1426,13 +1474,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
if (!cpus)
return;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- ns->task_capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
- ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
+ ns->nr_cpus = cpus;
+ ns->imbalance_pct = imbalance_pct;
+ ns->has_free_capacity = numa_has_free_capacity(ns);
}
struct task_numa_env {
@@ -1583,7 +1627,7 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
+ if (!numa_is_overloaded(&env->src_stats) &&
!env->dst_stats.has_free_capacity)
goto unlock;
@@ -1738,10 +1782,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid, env.imbalance_pct);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
/* Try to find a spot on the preferred nid. */
if (numa_has_capacity(&env))
@@ -1774,7 +1818,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env.dst_stats, env.dst_nid, env.imbalance_pct);
if (numa_has_capacity(&env))
task_numa_find_cpu(&env, taskimp, groupimp);
}
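For illustration, a minimal user-space sketch (not part of the patch) that mirrors the numa_has_free_capacity()/numa_is_overloaded() checks added above with concrete numbers. The 4-CPU node, the utilization value and the imbalance_pct of 125 are assumptions chosen for the example; the patch itself takes imbalance_pct from the scheduling domain.

/*
 * User-space sketch of the free-capacity / overload checks with concrete
 * numbers.  The 4-CPU node, the utilization figure and imbalance_pct = 125
 * are assumptions for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct numa_stats {
	unsigned long nr_running;
	unsigned long nr_cpus;
	unsigned long util;
	unsigned long compute_capacity;
	int imbalance_pct;
};

static bool numa_has_free_capacity(const struct numa_stats *ns)
{
	if (ns->nr_running < ns->nr_cpus)
		return true;
	/* util must stay below capacity * 100 / imbalance_pct (80% here). */
	return ns->compute_capacity * 100 > ns->util * ns->imbalance_pct;
}

static bool numa_is_overloaded(const struct numa_stats *ns)
{
	if (ns->nr_running <= ns->nr_cpus)
		return false;
	return ns->compute_capacity * 100 < ns->util * ns->imbalance_pct;
}

int main(void)
{
	/* 4 CPUs at full scale: compute_capacity = 4 * 1024 = 4096. */
	struct numa_stats ns = {
		.nr_running = 6,
		.nr_cpus = 4,
		.util = 3500,		/* ~85% of 4096 */
		.compute_capacity = 4096,
		.imbalance_pct = 125,
	};

	/* 4096 * 100 = 409600 < 3500 * 125 = 437500: node is overloaded. */
	printf("free capacity: %d, overloaded: %d\n",
	       numa_has_free_capacity(&ns), numa_is_overloaded(&ns));
	return 0;
}

With these numbers, util is about 85% of the node's capacity, above the 100/125 = 80% threshold, so the node reports no free capacity, and since it also runs more tasks than it has CPUs, it counts as overloaded.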
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..ff66e9dc81a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1665,9 +1665,6 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
return SCHED_CAPACITY_SCALE;
}
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..30e751218018 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1132,7 +1132,6 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
- .smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
.child = child,
@@ -1158,7 +1157,6 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;