 Documentation/arm/small_task_packing.txt | 136 +++++++++++++++++++
 arch/arm/Kconfig                         |  31 ++++-
 include/trace/events/sched.h             |  49 +++++++
 kernel/sched/fair.c                      | 218 ++++++++++++++------
 4 files changed, 381 insertions(+), 53 deletions(-)
diff --git a/Documentation/arm/small_task_packing.txt b/Documentation/arm/small_task_packing.txt
new file mode 100644
index 000000000000..43f0a8b80234
--- /dev/null
+++ b/Documentation/arm/small_task_packing.txt
@@ -0,0 +1,136 @@
+Small Task Packing in the big.LITTLE MP Reference Patch Set
+
+What is small task packing?
+----
+Simply that the scheduler will fit as many small tasks on a single CPU
+as possible before using other CPUs. A small task is defined as one
+whose tracked load is less than 90% of a NICE_0 task. This is a change
+from the usual behavior since the scheduler will normally use an idle
+CPU for a waking task unless that task is considered cache hot.
+
+
+How is it implemented?
+----
+Since a small task by definition spends most of its time sleeping,
+its placement can be decided each time it wakes. The main requirement
+for packing small tasks is therefore to select a partly-busy CPU at
+wakeup rather than looking for an idle CPU. We use the tracked load of
+the CPU runqueue to determine how heavily loaded each CPU is, and the
+tracked load of the task to determine whether it will fit on the CPU.
+We always start with the lowest-numbered CPU in a sched domain and
+stop looking when we find a CPU with enough space for the task.
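+
+As a rough illustration, the wakeup-time selection amounts to the
+following sketch. The helper names here are illustrative placeholders
+rather than the actual functions in fair.c; hmp_full_threshold is the
+real 'full CPU' tunable described later:
+
+	/* sketch: choose a CPU for small task 'p' among little CPUs */
+	static int small_task_select_cpu(struct task_struct *p,
+					 const struct cpumask *littles)
+	{
+		int cpu;
+
+		for_each_cpu(cpu, littles) {
+			/* runqueue tracked load plus the task's own load */
+			if (rq_tracked_load(cpu) + task_tracked_load(p) <
+					hmp_full_threshold)
+				return cpu;	/* first CPU with space wins */
+		}
+		/* no CPU has space: fall back to the normal idle search */
+		return fallback_idle_cpu(p);
+	}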
+
+Some further tweaks are necessary to suppress load balancing when the
+CPU is not fully loaded, otherwise the scheduler attempts to spread
+tasks evenly across the domain.
+
+
+How does it interact with the HMP patches?
+----
+Firstly, we only enable packing in the little domain. The big domain
+is intended to spread tasks amongst the available CPUs
+one-task-per-CPU, whereas the little domain attempts to use as
+little power as possible while servicing its tasks.
+
+Secondly, since we offload big tasks onto little CPUs in order to try
+to devote one CPU to each task, we have a threshold above which we do
+not try to pack a task and instead will select an idle CPU if possible.
+This maintains maximum forward progress for busy tasks temporarily
+demoted from big CPUs.
+
+
+Can the behaviour be tuned?
+----
+Yes. The load level at which a CPU is considered 'full' can be
+modified in the source and is exposed through sysfs as
+/sys/kernel/hmp/packing_limit, which can be changed at runtime.
+The presence of the packing behaviour is controlled by
+CONFIG_SCHED_HMP_LITTLE_PACKING, and packing can be disabled at
+runtime using /sys/kernel/hmp/packing_enable.
+The definition of a small task is hard-coded as 90% of NICE_0_LOAD
+and cannot be modified at runtime.
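+
+For example, packing can be switched off at runtime, or the 'full'
+level lowered to 450 (the value used in the worked example later in
+this document), with:
+
+	echo 0 > /sys/kernel/hmp/packing_enable
+	echo 450 > /sys/kernel/hmp/packing_limit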
+
+
+Why do I need to tune it?
+----
+The optimal configuration is likely to be different depending upon the
+design and manufacturing of your SoC.
+
+In the main, there are two system effects from enabling small task
+packing.
+
+1. CPU operating point may increase
+2. wakeup latency of tasks may be increased
+
+There are also likely to be secondary effects from loading one CPU
+rather than spreading tasks.
+
+Note that all of these system effects are dependent upon the workload
+under consideration.
+
+
+CPU Operating Point
+----
+The primary impact of loading one CPU with a number of light tasks is to
+increase the compute requirement of that CPU since it is no longer idle
+as often. Increased compute requirement causes an increase in the
+frequency of the CPU through CPUfreq.
+
+Consider this example:
+We have a system with 3 CPUs which can operate at any frequency between
+350MHz and 1GHz. The system has 6 tasks which would each produce 10%
+load at 1GHz. The scheduler has frequency-invariant load scaling
+enabled. Our DVFS governor aims for 80% utilization at the chosen
+frequency.
+
+Without task packing, these tasks will be spread out amongst all CPUs
+such that each has 2. Each CPU will be roughly 20% loaded, and the
+frequency of the package will remain at the 350MHz minimum.
+
+With task packing set to the default packing_limit, all of these
+tasks will sit on one CPU and require a package frequency of ~750MHz
+to reach 80% utilization.
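+
+In other words, the required frequency is the total 1GHz-relative
+load divided by the governor's target utilization:
+
+	total load    = 6 tasks * 10% = 60% of a 1GHz CPU
+	required freq = 0.6GHz / 0.8  = 0.75GHz (~750MHz)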
+
+When a package operates on a single frequency domain, all CPUs in that
+package share frequency and voltage.
+
+Depending upon the SoC implementation there can be a significant amount
+of energy lost to leakage from idle CPUs. The decision about how
+loaded a CPU must be to be considered 'full' is therefore controllable
+through sysfs (/sys/kernel/hmp/packing_limit) and directly in the code.
+
+Continuing the example, let's set packing_limit to 450, which means
+we will pack tasks until the total load of all running tasks >= 450.
+In practice, this is very similar to a 56%-idle 1GHz CPU
+(1 - 450/1024).
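+
+On the same 0..1023 scale, each example task contributes a load of
+about 102 (10% of 1024): four tasks total roughly 410, still below
+the 450 limit, while adding a fifth would take the total past 450,
+so it has to overflow onto another CPU.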
+
+Now we are only able to place 4 tasks on CPU0, and two will overflow
+onto CPU1. CPU0 will have a load of 40% and CPU1 a load of 20%. To
+still hit 80% utilization, the package now only needs to operate at
+0.4/0.8 = 0.5GHz, i.e. 500MHz rather than ~750MHz, and CPU2 is no
+longer needed and can be power-gated.
+
+In order to use less energy, the saving from power-gating CPU2 must be
+more than the energy spent running CPU0 for the extra cycles. This
+depends upon the SoC implementation.
+
+This is obviously a contrived example requiring all the tasks to
+be runnable at the same time, but it illustrates the point.
+
+
+Wakeup Latency
+----
+This is an unavoidable consequence of trying to pack tasks together
+rather than giving them a CPU each. If you cannot find an acceptable
+level of wakeup latency, you should turn packing off.
+
+Cyclictest is a good test application for determining the added latency
+when configuring packing.
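+
+A possible starting point (these are standard cyclictest options:
+-t starts one measurement thread per CPU, -p sets the SCHED_FIFO
+priority and -i the wakeup interval in microseconds):
+
+	cyclictest -t -p 80 -i 1000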
+
+
+Why is it turned off for the VersatileExpress V2P_CA15A7 CoreTile?
+----
+Simply, this core tile only has power gating for the whole A7 package.
+When small task packing is enabled, all our low-energy use cases
+normally fit onto one A7 CPU. We therefore end up with 2 mostly-idle
+CPUs and one mostly-busy CPU. This decreases the amount of time
+available where the whole package is idle and can be turned off.
+
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 426e541171ff..1116be551be5 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1510,6 +1510,17 @@ config SCHED_HMP
There is currently no support for migration of task groups, hence
!SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled
between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE).
+ When turned on, this option adds a /sys/kernel/hmp directory which
+ contains the following files:
+ up_threshold - the load average threshold used for up migration
+ (0 - 1023)
+ down_threshold - the load average threshold used for down migration
+ (0 - 1023)
+ hmp_domains - a list of cpumasks for the present HMP domains,
+ starting with the 'biggest' and ending with the
+ 'smallest'.
+ Note that both the threshold files can be written at runtime to
+ control scheduler behaviour.
config SCHED_HMP_PRIO_FILTER
bool "(EXPERIMENTAL) Filter HMP migrations by task priority"
@@ -1544,28 +1555,24 @@ config HMP_VARIABLE_SCALE
bool "Allows changing the load tracking scale through sysfs"
depends on SCHED_HMP
help
- When turned on, this option exports the thresholds and load average
- period value for the load tracking patches through sysfs.
+ When turned on, this option exports the load average period value
+ for the load tracking patches through sysfs.
The values can be modified to change the rate of load accumulation
- and the thresholds used for HMP migration.
- The load_avg_period_ms is the time in ms to reach a load average of
- 0.5 for an idle task of 0 load average ratio that start a busy loop.
- The up_threshold and down_threshold is the value to go to a faster
- CPU or to go back to a slower cpu.
- The {up,down}_threshold are devided by 1024 before being compared
- to the load average.
- For examples, with load_avg_period_ms = 128 and up_threshold = 512,
+ used for HMP migration. 'load_avg_period_ms' is the time in ms it
+ takes a task that starts with a load average ratio of 0 and then
+ becomes 100% busy to reach a load average of 0.5.
+ For example, with load_avg_period_ms = 128 and up_threshold = 512,
a running task with a load of 0 will be migrated to a bigger CPU after
128ms, because after 128ms its load_avg_ratio is 0.5 and the real
up_threshold is 0.5.
This patch has the same behavior as changing the Y of the load
average computation to
(1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
- but it remove intermadiate overflows in computation.
+ but removes intermediate overflows in computation.
config HMP_FREQUENCY_INVARIANT_SCALE
bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
- depends on HMP_VARIABLE_SCALE && CPU_FREQ
+ depends on SCHED_HMP && CPU_FREQ
help
Scales the current load contribution in line with the frequency
of the CPU that the task was executed on.
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 66dc53bca19a..2afcb71857fd 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -580,6 +580,55 @@ TRACE_EVENT(sched_task_usage_ratio,
);
/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) forced task migrations,
+ * recording whether the migrated task was running at the time.
+ */
+TRACE_EVENT(sched_hmp_migrate_force_running,
+
+ TP_PROTO(struct task_struct *tsk, int running),
+
+ TP_ARGS(tsk, running),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(int, running)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->running = running;
+ ),
+
+ TP_printk("running=%d comm=%s",
+ __entry->running, __entry->comm)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations pulled
+ * by a CPU that is about to go idle, recording whether the
+ * migrated task was running at the time.
+ */
+TRACE_EVENT(sched_hmp_migrate_idle_running,
+
+ TP_PROTO(struct task_struct *tsk, int running),
+
+ TP_ARGS(tsk, running),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(int, running)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->running = running;
+ ),
+
+ TP_printk("running=%d comm=%s",
+ __entry->running, __entry->comm)
+);
+
+/*
* Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
*/
#define HMP_MIGRATE_WAKEUP 0
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 643da90f3a7a..22913a60001d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -31,7 +31,6 @@
#include <linux/task_work.h>
#include <trace/events/sched.h>
-#ifdef CONFIG_HMP_VARIABLE_SCALE
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
@@ -40,7 +39,6 @@
*/
#include <linux/cpufreq.h>
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
-#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
@@ -1212,8 +1210,6 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
-#ifdef CONFIG_HMP_VARIABLE_SCALE
-
#define HMP_VARIABLE_SCALE_SHIFT 16ULL
struct hmp_global_attr {
struct attribute attr;
@@ -1224,6 +1220,7 @@ struct hmp_global_attr {
int *value;
int (*to_sysfs)(int);
int (*from_sysfs)(int);
+ ssize_t (*to_sysfs_text)(char *buf, int buf_size);
};
#define HMP_DATA_SYSFS_MAX 8
@@ -1294,7 +1291,6 @@ struct cpufreq_extents {
static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
-#endif /* CONFIG_HMP_VARIABLE_SCALE */
/* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -1340,9 +1336,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
delta = now - sa->last_runnable_update;
-#ifdef CONFIG_HMP_VARIABLE_SCALE
+
delta = hmp_variable_scale_convert(delta);
-#endif
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
@@ -3843,7 +3838,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
}
-#ifdef CONFIG_HMP_VARIABLE_SCALE
/*
* Heterogenous multiprocessor (HMP) optimizations
*
@@ -3876,27 +3870,35 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
* The scale factor hmp_data.multiplier is a fixed point
* number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
*/
-static u64 hmp_variable_scale_convert(u64 delta)
+static inline u64 hmp_variable_scale_convert(u64 delta)
{
+#ifdef CONFIG_HMP_VARIABLE_SCALE
u64 high = delta >> 32ULL;
u64 low = delta & 0xffffffffULL;
low *= hmp_data.multiplier;
high *= hmp_data.multiplier;
return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+#else
+ return delta;
+#endif
}
static ssize_t hmp_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
- ssize_t ret = 0;
struct hmp_global_attr *hmp_attr =
container_of(attr, struct hmp_global_attr, attr);
- int temp = *(hmp_attr->value);
+ int temp;
+
+ if (hmp_attr->to_sysfs_text != NULL)
+ return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
+
+ temp = *(hmp_attr->value);
if (hmp_attr->to_sysfs != NULL)
temp = hmp_attr->to_sysfs(temp);
- ret = sprintf(buf, "%d\n", temp);
- return ret;
+
+ return (ssize_t)sprintf(buf, "%d\n", temp);
}
static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
@@ -3925,11 +3927,31 @@ static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
return ret;
}
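+/*
+ * Emit the cpumask of each registered HMP domain into 'outbuf',
+ * biggest domain first, space-separated and newline-terminated.
+ * Returns the number of characters written.
+ */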
+static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
+{
+ char buf[64];
+ const char nospace[] = "%s", space[] = " %s";
+ const char *fmt = nospace;
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int outpos = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
+ outpos += sprintf(outbuf+outpos, fmt, buf);
+ fmt = space;
+ }
+ }
+ strcat(outbuf, "\n");
+ return outpos+1;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
static int hmp_period_tofrom_sysfs(int value)
{
return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
}
-
+#endif
/* max value for threshold is 1024 */
static int hmp_theshold_from_sysfs(int value)
{
@@ -3937,9 +3959,10 @@ static int hmp_theshold_from_sysfs(int value)
return -1;
return value;
}
-#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
-/* freqinvar control is only 0,1 off/on */
-static int hmp_freqinvar_from_sysfs(int value)
+#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
+ defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
+/* toggle control is only 0,1 off/on */
+static int hmp_toggle_from_sysfs(int value)
{
if (value < 0 || value > 1)
return -1;
@@ -3959,7 +3982,9 @@ static void hmp_attr_add(
const char *name,
int *value,
int (*to_sysfs)(int),
- int (*from_sysfs)(int))
+ int (*from_sysfs)(int),
+ ssize_t (*to_sysfs_text)(char *, int),
+ umode_t mode)
{
int i = 0;
while (hmp_data.attributes[i] != NULL) {
@@ -3967,13 +3992,17 @@ static void hmp_attr_add(
if (i >= HMP_DATA_SYSFS_MAX)
return;
}
- hmp_data.attr[i].attr.mode = 0644;
+ if (mode)
+ hmp_data.attr[i].attr.mode = mode;
+ else
+ hmp_data.attr[i].attr.mode = 0644;
hmp_data.attr[i].show = hmp_show;
hmp_data.attr[i].store = hmp_store;
hmp_data.attr[i].attr.name = name;
hmp_data.attr[i].value = value;
hmp_data.attr[i].to_sysfs = to_sysfs;
hmp_data.attr[i].from_sysfs = from_sysfs;
+ hmp_data.attr[i].to_sysfs_text = to_sysfs_text;
hmp_data.attributes[i] = &hmp_data.attr[i].attr;
hmp_data.attributes[i + 1] = NULL;
}
@@ -3982,40 +4011,59 @@ static int hmp_attr_init(void)
{
int ret;
memset(&hmp_data, 0, sizeof(hmp_data));
+ hmp_attr_add("hmp_domains",
+ NULL,
+ NULL,
+ NULL,
+ hmp_print_domains,
+ 0444);
+ hmp_attr_add("up_threshold",
+ &hmp_up_threshold,
+ NULL,
+ hmp_theshold_from_sysfs,
+ NULL,
+ 0);
+ hmp_attr_add("down_threshold",
+ &hmp_down_threshold,
+ NULL,
+ hmp_theshold_from_sysfs,
+ NULL,
+ 0);
+#ifdef CONFIG_HMP_VARIABLE_SCALE
/* by default load_avg_period_ms == LOAD_AVG_PERIOD
* meaning no change
*/
hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
-
hmp_attr_add("load_avg_period_ms",
&hmp_data.multiplier,
hmp_period_tofrom_sysfs,
- hmp_period_tofrom_sysfs);
- hmp_attr_add("up_threshold",
- &hmp_up_threshold,
- NULL,
- hmp_theshold_from_sysfs);
- hmp_attr_add("down_threshold",
- &hmp_down_threshold,
+ hmp_period_tofrom_sysfs,
NULL,
- hmp_theshold_from_sysfs);
+ 0);
+#endif
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
/* default frequency-invariant scaling ON */
hmp_data.freqinvar_load_scale_enabled = 1;
hmp_attr_add("frequency_invariant_load_scale",
&hmp_data.freqinvar_load_scale_enabled,
NULL,
- hmp_freqinvar_from_sysfs);
+ hmp_toggle_from_sysfs,
+ NULL,
+ 0);
#endif
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
hmp_attr_add("packing_enable",
&hmp_packing_enabled,
NULL,
- hmp_freqinvar_from_sysfs);
+ hmp_toggle_from_sysfs,
+ NULL,
+ 0);
hmp_attr_add("packing_limit",
&hmp_full_threshold,
NULL,
- hmp_packing_from_sysfs);
+ hmp_packing_from_sysfs,
+ NULL,
+ 0);
#endif
hmp_data.attr_group.name = "hmp";
hmp_data.attr_group.attrs = hmp_data.attributes;
@@ -4024,7 +4072,6 @@ static int hmp_attr_init(void)
return 0;
}
late_initcall(hmp_attr_init);
-#endif /* CONFIG_HMP_VARIABLE_SCALE */
/*
* return the load of the lowest-loaded CPU in a given HMP domain
* min_cpu optionally points to an int to receive the CPU.
@@ -6915,6 +6962,69 @@ out_unlock:
return 0;
}
+/*
+ * Move a task in a runnable state to another CPU.
+ *
+ * Modelled on 'active_load_balance_cpu_stop' with slight
+ * modifications to the locking and pre-transfer checks. Note that
+ * rq->lock must be held before calling.
+ */
+static void hmp_migrate_runnable_task(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int src_cpu = cpu_of(rq);
+ struct rq *src_rq = rq;
+ int dst_cpu = rq->push_cpu;
+ struct rq *dst_rq = cpu_rq(dst_cpu);
+ struct task_struct *p = rq->migrate_task;
+ /*
+ * One last check to make sure nobody else is playing
+ * with the source rq.
+ */
+ if (src_rq->active_balance)
+ return;
+
+ if (src_rq->nr_running <= 1)
+ return;
+
+ if (task_rq(p) != src_rq)
+ return;
+ /*
+ * Not sure if this applies here but one can never
+ * be too cautious
+ */
+ BUG_ON(src_rq == dst_rq);
+
+ double_lock_balance(src_rq, dst_rq);
+
+ rcu_read_lock();
+ for_each_domain(dst_cpu, sd) {
+ if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = src_cpu,
+ .src_rq = src_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+
+ rcu_read_unlock();
+ double_unlock_balance(src_rq, dst_rq);
+}
+
static DEFINE_SPINLOCK(hmp_force_migration);
/*
@@ -6927,13 +7037,14 @@ static void hmp_force_up_migration(int this_cpu)
struct sched_entity *curr, *orig;
struct rq *target;
unsigned long flags;
- unsigned int force;
+ unsigned int force, got_target;
struct task_struct *p;
if (!spin_trylock(&hmp_force_migration))
return;
for_each_online_cpu(cpu) {
force = 0;
+ got_target = 0;
target = cpu_rq(cpu);
raw_spin_lock_irqsave(&target->lock, flags);
curr = target->cfs.curr;
@@ -6956,15 +7067,14 @@ static void hmp_force_up_migration(int this_cpu)
if (hmp_up_migration(cpu, &target_cpu, curr)) {
if (!target->active_balance) {
get_task_struct(p);
- target->active_balance = 1;
target->push_cpu = target_cpu;
target->migrate_task = p;
- force = 1;
+ got_target = 1;
trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
hmp_next_up_delay(&p->se, target->push_cpu);
}
}
- if (!force && !target->active_balance) {
+ if (!got_target && !target->active_balance) {
/*
* For now we just check the currently running task.
* Selecting the lightest task for offloading will
@@ -6975,14 +7085,29 @@ static void hmp_force_up_migration(int this_cpu)
target->push_cpu = hmp_offload_down(cpu, curr);
if (target->push_cpu < NR_CPUS) {
get_task_struct(p);
- target->active_balance = 1;
target->migrate_task = p;
- force = 1;
+ got_target = 1;
trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
hmp_next_down_delay(&p->se, target->push_cpu);
}
}
+ /*
+ * We have a target with no active_balance. If the task
+ * is not currently running, move it now; otherwise let the
+ * CPU stopper take care of it.
+ */
+ if (got_target && !target->active_balance) {
+ if (!task_running(target, p)) {
+ trace_sched_hmp_migrate_force_running(p, 0);
+ hmp_migrate_runnable_task(target);
+ } else {
+ target->active_balance = 1;
+ force = 1;
+ }
+ }
+
raw_spin_unlock_irqrestore(&target->lock, flags);
+
if (force)
stop_one_cpu_nowait(cpu_of(target),
hmp_active_task_migration_cpu_stop,
@@ -7002,7 +7127,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
int cpu;
struct sched_entity *curr, *orig;
struct hmp_domain *hmp_domain = NULL;
- struct rq *target, *rq;
+ struct rq *target = NULL, *rq;
unsigned long flags, ratio = 0;
unsigned int force = 0;
struct task_struct *p = NULL;
@@ -7054,14 +7179,25 @@ static unsigned int hmp_idle_pull(int this_cpu)
raw_spin_lock_irqsave(&target->lock, flags);
if (!target->active_balance && task_rq(p) == target) {
get_task_struct(p);
- target->active_balance = 1;
target->push_cpu = this_cpu;
target->migrate_task = p;
- force = 1;
trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
hmp_next_up_delay(&p->se, target->push_cpu);
+ /*
+ * If the task isn't running, move it right away.
+ * Otherwise set up the active_balance mechanism and let
+ * the CPU stopper do its job.
+ */
+ if (!task_running(target, p)) {
+ trace_sched_hmp_migrate_idle_running(p, 0);
+ hmp_migrate_runnable_task(target);
+ } else {
+ target->active_balance = 1;
+ force = 1;
+ }
}
raw_spin_unlock_irqrestore(&target->lock, flags);
+
if (force) {
stop_one_cpu_nowait(cpu_of(target),
hmp_idle_pull_cpu_stop,