summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andrey Konovalov <andrey.konovalov@linaro.org> 2012-08-16 22:43:43 +0400
committer: Andrey Konovalov <andrey.konovalov@linaro.org> 2012-08-16 22:43:43 +0400
commit: fb7ef00ee55d7a84727dd1dd79d3024780735cb7 (patch)
tree: 17f50c0ca0ff3665d47a70efedbd2c20611deb96
parent: 97313f534c6a72a9c0621c9f6288feb3bd9d3e4c (diff)
parent: 6a1b927448144fe2fa7e28949cc54ea71bbe69d1 (diff)
Merge branch 'tracking-big-LITTLE-MP-v5' into merge-linux-linaro-core-trackingste-lt-next.old
-rw-r--r-- arch/arm/Kconfig 29
-rw-r--r-- arch/arm/kernel/topology.c 69
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_irq.c 253
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_irq.h 6
-rw-r--r-- include/linux/kthread.h 11
-rw-r--r-- include/linux/sched.h 27
-rw-r--r-- include/linux/smpboot.h 43
-rw-r--r-- include/trace/events/sched.h 151
-rw-r--r-- kernel/Makefile 3
-rw-r--r-- kernel/cpu.c 10
-rw-r--r-- kernel/kthread.c 185
-rw-r--r-- kernel/power/suspend.c 3
-rw-r--r-- kernel/rcutree.c 12
-rw-r--r-- kernel/rcutree.h 15
-rw-r--r-- kernel/rcutree_plugin.h 403
-rw-r--r-- kernel/rcutree_trace.c 3
-rw-r--r-- kernel/sched/core.c 5
-rw-r--r-- kernel/sched/debug.c 39
-rw-r--r-- kernel/sched/fair.c 1097
-rw-r--r-- kernel/sched/features.h 2
-rw-r--r-- kernel/sched/sched.h 60
-rw-r--r-- kernel/smpboot.c 233
-rw-r--r-- kernel/smpboot.h 4
-rw-r--r-- kernel/softirq.c 108
-rw-r--r-- kernel/watchdog.c 280
-rw-r--r-- linaro/configs/big-LITTLE-MP.conf 9
26 files changed, 2039 insertions, 1021 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 337e356b7c6..f6386f54e25 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1597,6 +1597,35 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+ bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+ help
+ Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+ bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+ depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+ help
+ Experimental scheduler optimizations for heterogeneous platforms.
+ Attempts to introspectively select task affinity to optimize power
+ and performance. Currently supports two types of CPUs: fast
+ (high-performance) and slow (power-efficient). There is currently
+ no support for migration of task groups, hence !SCHED_AUTOGROUP.
+
+config HMP_FAST_CPU_MASK
+ string "HMP scheduler fast CPU mask"
+ depends on SCHED_HMP
+ help
+ Specifies the cpuids of the fast CPUs in the system as a list
+ string, e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+ string "HMP scheduler slow CPU mask"
+ depends on SCHED_HMP
+ help
+ Specifies the cpuids of the slow CPUs in the system as a list
+ string, e.g. cpuid 0+1 should be specified as 0-1.
+
config HAVE_ARM_SCU
bool
help
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 198b08456e9..1f644acba43 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -317,6 +317,75 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+
+#ifdef CONFIG_SCHED_HMP
+
+static const char * const little_cores[] = {
+ "arm,cortex-a7",
+ NULL,
+};
+
+static bool is_little_cpu(struct device_node *cn)
+{
+ const char * const *lc;
+ for (lc = little_cores; *lc; lc++)
+ if (of_device_is_compatible(cn, *lc))
+ return true;
+ return false;
+}
+
+void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+ struct cpumask *slow)
+{
+ struct device_node *cn = NULL;
+ int cpu = 0;
+
+ cpumask_clear(fast);
+ cpumask_clear(slow);
+
+ /*
+ * Use the config options if they are given. This helps testing
+ * HMP scheduling on systems without a big.LITTLE architecture.
+ */
+ if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+ if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast))
+ WARN(1, "Failed to parse HMP fast cpu mask!\n");
+ if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow))
+ WARN(1, "Failed to parse HMP slow cpu mask!\n");
+ return;
+ }
+
+ /*
+ * Else, parse device tree for little cores.
+ */
+ while ((cn = of_find_node_by_type(cn, "cpu"))) {
+
+ if (cpu >= num_possible_cpus())
+ break;
+
+ if (is_little_cpu(cn))
+ cpumask_set_cpu(cpu, slow);
+ else
+ cpumask_set_cpu(cpu, fast);
+
+ cpu++;
+ }
+
+ if (!cpumask_empty(fast) && !cpumask_empty(slow))
+ return;
+
+ /*
+ * We didn't find both big and little cores so let's call all cores
+ * fast as this will keep the system running, with all cores being
+ * treated equal.
+ */
+ cpumask_setall(fast);
+ cpumask_clear(slow);
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
+
/*
* init_cpu_topology is called at boot when only one cpu is running
* which prevent simultaneous write access to cpu_topology array
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 53589000fd0..a4c9f9d4552 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -42,6 +42,7 @@
*/
#include <linux/slab.h>
+#include <linux/smpboot.h>
#include "ehca_classes.h"
#include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
ehca_process_eq((struct ehca_shca*)data, 1);
}
-static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
{
int cpu;
unsigned long flags;
@@ -662,17 +663,23 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
ehca_dmp(cpu_online_mask, cpumask_size(), "");
spin_lock_irqsave(&pool->last_cpu_lock, flags);
- cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
- pool->last_cpu = cpu;
+ while (1) {
+ cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_online_mask);
+ pool->last_cpu = cpu;
+ /* Might be on the way out */
+ if (per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active)
+ break;
+ }
spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
return cpu;
}
static void __queue_comp_task(struct ehca_cq *__cq,
- struct ehca_cpu_comp_task *cct)
+ struct ehca_cpu_comp_task *cct,
+ struct task_struct *thread)
{
unsigned long flags;
@@ -683,7 +690,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
__cq->nr_callbacks++;
list_add_tail(&__cq->entry, &cct->cq_list);
cct->cq_jobs++;
- wake_up(&cct->wait_queue);
+ wake_up_process(thread);
} else
__cq->nr_callbacks++;
@@ -695,6 +702,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
{
int cpu_id;
struct ehca_cpu_comp_task *cct;
+ struct task_struct *thread;
int cq_jobs;
unsigned long flags;
@@ -702,7 +710,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
BUG_ON(!cpu_online(cpu_id));
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
- BUG_ON(!cct);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
spin_lock_irqsave(&cct->task_lock, flags);
cq_jobs = cct->cq_jobs;
@@ -710,28 +719,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
if (cq_jobs > 0) {
cpu_id = find_next_online_cpu(pool);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
- BUG_ON(!cct);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
}
-
- __queue_comp_task(__cq, cct);
+ __queue_comp_task(__cq, cct, thread);
}
static void run_comp_task(struct ehca_cpu_comp_task *cct)
{
struct ehca_cq *cq;
- unsigned long flags;
-
- spin_lock_irqsave(&cct->task_lock, flags);
while (!list_empty(&cct->cq_list)) {
cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
- spin_unlock_irqrestore(&cct->task_lock, flags);
+ spin_unlock_irq(&cct->task_lock);
comp_event_callback(cq);
if (atomic_dec_and_test(&cq->nr_events))
wake_up(&cq->wait_completion);
- spin_lock_irqsave(&cct->task_lock, flags);
+ spin_lock_irq(&cct->task_lock);
spin_lock(&cq->task_lock);
cq->nr_callbacks--;
if (!cq->nr_callbacks) {
@@ -740,159 +746,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
}
spin_unlock(&cq->task_lock);
}
-
- spin_unlock_irqrestore(&cct->task_lock, flags);
}
-static int comp_task(void *__cct)
+static void comp_task_park(unsigned int cpu)
{
- struct ehca_cpu_comp_task *cct = __cct;
- int cql_empty;
- DECLARE_WAITQUEUE(wait, current);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- add_wait_queue(&cct->wait_queue, &wait);
-
- spin_lock_irq(&cct->task_lock);
- cql_empty = list_empty(&cct->cq_list);
- spin_unlock_irq(&cct->task_lock);
- if (cql_empty)
- schedule();
- else
- __set_current_state(TASK_RUNNING);
-
- remove_wait_queue(&cct->wait_queue, &wait);
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ struct ehca_cpu_comp_task *target;
+ struct task_struct *thread;
+ struct ehca_cq *cq, *tmp;
+ LIST_HEAD(list);
- spin_lock_irq(&cct->task_lock);
- cql_empty = list_empty(&cct->cq_list);
- spin_unlock_irq(&cct->task_lock);
- if (!cql_empty)
- run_comp_task(__cct);
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ list_splice_init(&cct->cq_list, &list);
+ spin_unlock_irq(&cct->task_lock);
- set_current_state(TASK_INTERRUPTIBLE);
+ cpu = find_next_online_cpu(pool);
+ target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ thread = per_cpu_ptr(pool->cpu_comp_threads, cpu);
+ spin_lock_irq(&target->task_lock);
+ list_for_each_entry_safe(cq, tmp, &list, entry) {
+ list_del(&cq->entry);
+ __queue_comp_task(cq, target, thread);
}
- __set_current_state(TASK_RUNNING);
-
- return 0;
-}
-
-static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
- int cpu)
-{
- struct ehca_cpu_comp_task *cct;
-
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- spin_lock_init(&cct->task_lock);
- INIT_LIST_HEAD(&cct->cq_list);
- init_waitqueue_head(&cct->wait_queue);
- cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
- "ehca_comp/%d", cpu);
-
- return cct->task;
+ spin_unlock_irq(&target->task_lock);
}
-static void destroy_comp_task(struct ehca_comp_pool *pool,
- int cpu)
+static void comp_task_stop(unsigned int cpu, bool online)
{
- struct ehca_cpu_comp_task *cct;
- struct task_struct *task;
- unsigned long flags_cct;
-
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
- spin_lock_irqsave(&cct->task_lock, flags_cct);
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- task = cct->task;
- cct->task = NULL;
+ spin_lock_irq(&cct->task_lock);
cct->cq_jobs = 0;
-
- spin_unlock_irqrestore(&cct->task_lock, flags_cct);
-
- if (task)
- kthread_stop(task);
+ cct->active = 0;
+ WARN_ON(!list_empty(&cct->cq_list));
+ spin_unlock_irq(&cct->task_lock);
}
-static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu)
+static int comp_task_should_run(unsigned int cpu)
{
struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- LIST_HEAD(list);
- struct ehca_cq *cq;
- unsigned long flags_cct;
-
- spin_lock_irqsave(&cct->task_lock, flags_cct);
-
- list_splice_init(&cct->cq_list, &list);
-
- while (!list_empty(&list)) {
- cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-
- list_del(&cq->entry);
- __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
- }
-
- spin_unlock_irqrestore(&cct->task_lock, flags_cct);
+ return cct->cq_jobs;
}
-static int __cpuinit comp_pool_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int comp_task(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct ehca_cpu_comp_task *cct;
+ struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+ int cql_empty;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
- if (!create_comp_task(pool, cpu)) {
- ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
- return notifier_from_errno(-ENOMEM);
- }
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- kthread_bind(cct->task, cpumask_any(cpu_online_mask));
- destroy_comp_task(pool, cpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
- cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
- kthread_bind(cct->task, cpu);
- wake_up_process(cct->task);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
- destroy_comp_task(pool, cpu);
- take_over_work(pool, cpu);
- break;
+ spin_lock_irq(&cct->task_lock);
+ cql_empty = list_empty(&cct->cq_list);
+ if (!cql_empty) {
+ __set_current_state(TASK_RUNNING);
+ run_comp_task(cct);
}
-
- return NOTIFY_OK;
+ spin_unlock_irq(&cct->task_lock);
}
-static struct notifier_block comp_pool_callback_nb __cpuinitdata = {
- .notifier_call = comp_pool_callback,
- .priority = 0,
+static struct smp_hotplug_thread comp_pool_threads = {
+ .thread_should_run = comp_task_should_run,
+ .thread_fn = comp_task,
+ .thread_comm = "ehca_comp/%u",
+ .cleanup = comp_task_stop,
+ .park = comp_task_park,
};
int ehca_create_comp_pool(void)
{
- int cpu;
- struct task_struct *task;
+ int cpu, ret = -ENOMEM;
if (!ehca_scaling_code)
return 0;
@@ -905,38 +828,46 @@ int ehca_create_comp_pool(void)
pool->last_cpu = cpumask_any(cpu_online_mask);
pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
- if (pool->cpu_comp_tasks == NULL) {
- kfree(pool);
- return -EINVAL;
- }
+ if (!pool->cpu_comp_tasks)
+ goto out_pool;
- for_each_online_cpu(cpu) {
- task = create_comp_task(pool, cpu);
- if (task) {
- kthread_bind(task, cpu);
- wake_up_process(task);
- }
+ pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+ if (!pool->cpu_comp_threads)
+ goto out_tasks;
+
+ for_each_present_cpu(cpu) {
+ struct ehca_cpu_comp_task *cct;
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ spin_lock_init(&cct->task_lock);
+ INIT_LIST_HEAD(&cct->cq_list);
}
- register_hotcpu_notifier(&comp_pool_callback_nb);
+ comp_pool_threads.store = pool->cpu_comp_threads;
+ ret = smpboot_register_percpu_thread(&comp_pool_threads);
+ if (ret)
+ goto out_threads;
- printk(KERN_INFO "eHCA scaling code enabled\n");
+ pr_info("eHCA scaling code enabled\n");
+ return ret;
- return 0;
+out_threads:
+ free_percpu(pool->cpu_comp_threads);
+out_tasks:
+ free_percpu(pool->cpu_comp_tasks);
+out_pool:
+ kfree(pool);
+ return ret;
}
void ehca_destroy_comp_pool(void)
{
- int i;
-
if (!ehca_scaling_code)
return;
- unregister_hotcpu_notifier(&comp_pool_callback_nb);
-
- for_each_online_cpu(i)
- destroy_comp_task(pool, i);
+ smpboot_unregister_percpu_thread(&comp_pool_threads);
+ free_percpu(pool->cpu_comp_threads);
free_percpu(pool->cpu_comp_tasks);
kfree(pool);
}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
index 3346cb06cea..5370199f08c 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.h
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
void ehca_process_eq(struct ehca_shca *shca, int is_irq);
struct ehca_cpu_comp_task {
- wait_queue_head_t wait_queue;
struct list_head cq_list;
- struct task_struct *task;
spinlock_t task_lock;
int cq_jobs;
+ int active;
};
struct ehca_comp_pool {
- struct ehca_cpu_comp_task *cpu_comp_tasks;
+ struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+ struct task_struct * __percpu *cpu_comp_threads;
int last_cpu;
spinlock_t last_cpu_lock;
};
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 22ccf9dee17..8d816646f76 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data,
+ unsigned int cpu,
+ const char *namefmt);
+
/**
* kthread_run - create and wake a thread.
* @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
void kthread_bind(struct task_struct *k, unsigned int cpu);
int kthread_stop(struct task_struct *k);
-int kthread_should_stop(void);
+bool kthread_should_stop(void);
+bool kthread_should_park(void);
bool kthread_freezable_should_stop(bool *was_frozen);
void *kthread_data(struct task_struct *k);
+int kthread_park(struct task_struct *k);
+void kthread_unpark(struct task_struct *k);
+void kthread_parkme(void);
int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 85070c6581a..08854add569 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -334,14 +334,6 @@ static inline void lockup_detector_init(void)
}
#endif
-#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND)
-void lockup_detector_bootcpu_resume(void);
-#else
-static inline void lockup_detector_bootcpu_resume(void)
-{
-}
-#endif
-
#ifdef CONFIG_DETECT_HUNG_TASK
extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_check_count;
@@ -1114,6 +1106,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1148,6 +1141,15 @@ struct load_weight {
unsigned long weight, inv_weight;
};
+struct sched_avg {
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+ s64 decay_count;
+ unsigned long load_avg_contrib;
+ unsigned long load_avg_ratio;
+ u32 usage_avg_sum;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1208,6 +1210,15 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ /* Per-entity load-tracking */
+ struct sched_avg avg;
+#endif
};
struct sched_rt_entity {
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644
index 00000000000..e0106d8581d
--- /dev/null
+++ b/include/linux/smpboot.h
@@ -0,0 +1,43 @@
+#ifndef _LINUX_SMPBOOT_H
+#define _LINUX_SMPBOOT_H
+
+#include <linux/types.h>
+
+struct task_struct;
+/* Cookie handed to the thread_fn */
+struct smpboot_thread_data;
+
+/**
+ * struct smp_hotplug_thread - CPU hotplug related thread descriptor
+ * @store: Pointer to per cpu storage for the task pointers
+ * @list: List head for core management
+ * @thread_should_run: Check whether the thread should run or not. Called with
+ * preemption disabled.
+ * @thread_fn: The associated thread function
+ * @setup: Optional setup function, called when the thread gets
+ * operational the first time
+ * @cleanup: Optional cleanup function, called when the thread
+ * should stop (module exit)
+ * @park: Optional park function, called when the thread is
+ * parked (cpu offline)
+ * @unpark: Optional unpark function, called when the thread is
+ * unparked (cpu online)
+ * @thread_comm: The base name of the thread
+ */
+struct smp_hotplug_thread {
+ struct task_struct __percpu **store;
+ struct list_head list;
+ int (*thread_should_run)(unsigned int cpu);
+ void (*thread_fn)(unsigned int cpu);
+ void (*setup)(unsigned int cpu);
+ void (*cleanup)(unsigned int cpu, bool online);
+ void (*park)(unsigned int cpu);
+ void (*unpark)(unsigned int cpu);
+ const char *thread_comm;
+};
+
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_thread_schedule(void);
+
+#endif
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a2035456..2c50a06fbed 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -426,6 +426,157 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+ TP_ARGS(tsk, load_contrib),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, load_contrib)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->load_contrib = load_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d load_contrib=%lu",
+ __entry->comm, __entry->pid,
+ __entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+ TP_PROTO(int cpu, unsigned long ratio),
+
+ TP_ARGS(cpu, ratio),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("cpu=%d ratio=%lu",
+ __entry->cpu,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+ TP_PROTO(int cpu, u64 load),
+
+ TP_ARGS(cpu, load),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(u64, load)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ ),
+
+ TP_printk("cpu=%d load=%llu",
+ __entry->cpu,
+ __entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+ TP_PROTO(struct task_struct *tsk, int val),
+
+ TP_ARGS(tsk, val),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->val = val;
+ ),
+
+ TP_printk("comm=%s pid=%d val=%d",
+ __entry->comm, __entry->pid,
+ __entry->val)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764..e5602d32acb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
- async.o range.o groups.o lglock.o
+ async.o range.o groups.o lglock.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f2d6ddc3409..74fd39a036f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__func__, cpu);
goto out_release;
}
+ smpboot_park_threads(cpu);
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out;
}
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
+
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b579af57ea1..146a6fa9682 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
};
struct kthread {
- int should_stop;
+ unsigned long flags;
+ unsigned int cpu;
void *data;
+ struct completion parked;
struct completion exited;
};
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
#define to_kthread(tsk) \
container_of((tsk)->vfork_done, struct kthread, exited)
@@ -52,13 +61,29 @@ struct kthread {
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
{
- return to_kthread(current)->should_stop;
+ return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true. You should then do the necessary
+ * cleanup and call kthread_parkme()
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
+/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+static void __kthread_parkme(struct kthread *self)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+ if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+ complete(&self->parked);
+ schedule();
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ clear_bit(KTHREAD_IS_PARKED, &self->flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+ __kthread_parkme(to_kthread(current));
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
struct kthread self;
int ret;
- self.should_stop = 0;
+ self.flags = 0;
self.data = data;
init_completion(&self.exited);
+ init_completion(&self.parked);
current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
schedule();
ret = -EINTR;
- if (!self.should_stop)
- ret = threadfn(data);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ __kthread_parkme(&self);
+ ret = threadfn(data);
+ }
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data,
- int node,
+ void *data, int node,
const char namefmt[],
...)
{
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
+static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_THREAD_BOUND;
+}
+
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
@@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
WARN_ON(1);
return;
}
-
- /* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+ __kthread_bind(p, cpu);
}
EXPORT_SYMBOL(kthread_bind);
/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread.
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data, unsigned int cpu,
+ const char *namefmt)
+{
+ struct task_struct *p;
+
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+ cpu);
+ if (IS_ERR(p))
+ return p;
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+ /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+ kthread_park(p);
+ return p;
+}
+
+static struct kthread *task_get_live_kthread(struct task_struct *k)
+{
+ struct kthread *kthread;
+
+ get_task_struct(k);
+ kthread = to_kthread(k);
+ /* It might have exited */
+ barrier();
+ if (k->vfork_done != NULL)
+ return kthread;
+ return NULL;
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false and, if @k was
+ * parked, wakes it without waiting for it to leave the park code. If
+ * the thread is marked percpu then it is bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+
+ if (kthread) {
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu);
+ wake_up_process(k);
+ }
+ }
+ put_task_struct(k);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to park. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+ int ret = -ENOSYS;
+
+ if (kthread) {
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
+ }
+ }
+ ret = 0;
+ }
+ put_task_struct(k);
+ return ret;
+}
+
+/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread;
+ struct kthread *kthread = task_get_live_kthread(k);
int ret;
trace_sched_kthread_stop(k);
- get_task_struct(k);
-
- kthread = to_kthread(k);
- barrier(); /* it might have exited */
- if (k->vfork_done != NULL) {
- kthread->should_stop = 1;
+ if (kthread) {
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1da39ea248f..c8b7446b27d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
- /* Kick the lockup detector */
- lockup_detector_bootcpu_resume();
-
Enable_cpus:
enable_nonboot_cpus();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e542e3e..11a4fdca1df 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -133,13 +133,12 @@ static int rcu_scheduler_fully_active __read_mostly;
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -1468,8 +1467,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
- rcu_stop_cpu_kthread(cpu);
- rcu_node_kthread_setaffinity(rnp, -1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -2594,12 +2592,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
- rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
+ rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f212..1224d4c0538 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -196,12 +196,6 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
- struct task_struct *node_kthread_task;
- /* kthread that takes care of this rcu_node */
- /* structure, for example, awakening the */
- /* per-CPU kthreads as needed. */
- unsigned int node_kthread_status;
- /* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
/*
@@ -468,7 +462,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +484,9 @@ static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm);
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+ struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c0df0..c1961aed121 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/
#include <linux/delay.h>
+#include <linux/smpboot.h>
#define RCU_KTHREAD_PRIO 1
@@ -1069,6 +1070,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * Wake @t unless @status says it is yielding; a yielding
+ * thread is only woken when this is invoked from idle.
+ */
+ if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+ wake_up_process(t);
+}
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1141,17 +1152,6 @@ static int rcu_boost(struct rcu_node *rnp)
}
/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost. We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
- invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
-/*
* Priority-boosting kthread. One per leaf rcu_node and one for the
* root rcu_node.
*/
@@ -1174,8 +1174,9 @@ static int rcu_boost_kthread(void *arg)
else
spincnt = 0;
if (spincnt > 10) {
+ rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
trace_rcu_utilization("End boost kthread@rcu_yield");
- rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ schedule_timeout_interruptible(2);
trace_rcu_utilization("Start boost kthread@rcu_yield");
spincnt = 0;
}
@@ -1213,8 +1214,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
t = rnp->boost_kthread_task;
- if (t != NULL)
- wake_up_process(t);
+ if (t)
+ rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1232,10 @@ static void invoke_rcu_callbacks_kthread(void)
local_irq_save(flags);
__this_cpu_write(rcu_cpu_has_work, 1);
if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task))
- wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ current != __this_cpu_read(rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_cpu_kthread_status));
+ }
local_irq_restore(flags);
}
@@ -1245,21 +1248,6 @@ static bool rcu_is_callbacks_kthread(void)
return __get_cpu_var(rcu_cpu_kthread_task) == current;
}
-/*
- * Set the affinity of the boost kthread. The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm)
-{
- struct task_struct *t;
-
- t = rnp->boost_kthread_task;
- if (t != NULL)
- set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1276,15 +1264,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index)
+ struct rcu_node *rnp)
{
+ int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
struct sched_param sp;
struct task_struct *t;
if (&rcu_preempt_state != rsp)
return 0;
+
+ if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ return 0;
+
rsp->boost = 1;
if (rnp->boost_kthread_task != NULL)
return 0;
@@ -1301,25 +1293,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1300,22 @@ static void rcu_kthread_do_work(void)
rcu_preempt_do_callbacks();
}
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+static void rcu_cpu_kthread_setup(unsigned int cpu)
{
- struct task_struct *t;
-
- t = rnp->node_kthread_task;
- if (t != NULL)
- wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
- struct rcu_node *rnp = rdp->mynode;
- atomic_or(rdp->grpmask, &rnp->wakemask);
- invoke_rcu_node_kthread(rnp);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+static void rcu_cpu_kthread_park(unsigned int cpu)
{
- struct sched_param sp;
- struct timer_list yield_timer;
- int prio = current->rt_priority;
-
- setup_timer_on_stack(&yield_timer, f, arg);
- mod_timer(&yield_timer, jiffies + 2);
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
- set_user_nice(current, 19);
- schedule();
- set_user_nice(current, 0);
- sp.sched_priority = prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
- del_timer(&yield_timer);
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
+ return __get_cpu_var(rcu_cpu_has_work);
}
/*
@@ -1440,138 +1323,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
-static int rcu_cpu_kthread(void *arg)
+static void rcu_cpu_kthread(unsigned int cpu)
{
- int cpu = (int)(long)arg;
- unsigned long flags;
- int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+ char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+ int spincnt;
- trace_rcu_utilization("Start CPU kthread@init");
- for (;;) {
- *statusp = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
+ for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
- local_bh_enable();
- break;
- }
*statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
- local_irq_save(flags);
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
work = *workp;
*workp = 0;
- local_irq_restore(flags);
+ local_irq_enable();
if (work)
rcu_kthread_do_work();
local_bh_enable();
- if (*workp != 0)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
- trace_rcu_utilization("Start CPU kthread@rcu_yield");
- spincnt = 0;
- }
- }
- *statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed. We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp = (struct rcu_node *)arg;
- struct sched_param sp;
- struct task_struct *t;
-
- for (;;) {
- rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- rcu_wait(atomic_read(&rnp->wakemask) != 0);
- rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = atomic_xchg(&rnp->wakemask, 0);
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
- if ((mask & 0x1) == 0)
- continue;
- preempt_disable();
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (!cpu_online(cpu) || t == NULL) {
- preempt_enable();
- continue;
- }
- per_cpu(rcu_cpu_has_work, cpu) = 1;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- preempt_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization("End CPU kthread@rcu_wait");
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
}
}
- /* NOTREACHED */
- rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
- return 0;
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization("Start CPU kthread@rcu_yield");
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization("End CPU kthread@rcu_yield");
+ *statusp = RCU_KTHREAD_WAITING;
}
/*
@@ -1583,17 +1363,17 @@ static int rcu_node_kthread(void *arg)
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
*/
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
+ struct task_struct *t = rnp->boost_kthread_task;
+ unsigned long mask = rnp->qsmaskinit;
cpumask_var_t cm;
int cpu;
- unsigned long mask = rnp->qsmaskinit;
- if (rnp->node_kthread_task == NULL)
+ if (!t)
return;
- if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
- cpumask_clear(cm);
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
@@ -1603,62 +1383,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
cpumask_clear_cpu(cpu, cm);
WARN_ON_ONCE(cpumask_weight(cm) == 0);
}
- set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
- rcu_boost_kthread_setaffinity(rnp, cm);
+ set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held. So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
-{
- unsigned long flags;
- int rnp_index = rnp - &rsp->node[0];
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- rnp->qsmaskinit == 0)
- return 0;
- if (rnp->node_kthread_task == NULL) {
- t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->node_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = 99;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- }
- return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
static int __init rcu_spawn_kthreads(void)
{
- int cpu;
struct rcu_node *rnp;
+ int cpu;
rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
+ for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
return 0;
}
@@ -1670,11 +1424,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ if (rcu_scheduler_fully_active)
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1449,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index abffb486e94..31968931f14 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -108,11 +108,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 988c6323224..fe86c913fa6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1713,6 +1715,9 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ p->se.avg.decay_count = 0;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea..b9d54d0d7bb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22321db6495..76e40cbc14b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -958,18 +872,491 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 46742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7,
+ 0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86,
+ 0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581,
+ 0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9,
+ 0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/* Precomputed \Sum y^k { 1<=k<=n } */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ int local_n;
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* n <= LOAD_AVG_PERIOD * 63 at this point, so it fits in an int */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ * to achieve constant time decay_load. local_n itself must be reduced
+ * modulo PERIOD, as it is the index used for the table look-up below.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ return SRR(val, 32);
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n (n is u64, like the caller's period
+ * count, so huge values saturate at LOAD_AVG_MAX instead of truncating).
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable,
+ int running)
{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/* Synchronize an entity's decay with its parenting cfs_rq. */
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count += decays;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ s64 tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic64_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib, usage_contrib;
+
+ contrib = div_u64(sa->runnable_avg_sum << 12,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ usage_contrib = div_u64(sa->usage_avg_sum << 12,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div64_u64(contrib,
+ atomic64_read(&tg->load_avg) + 1);
+
+ /*
+ * Unlike a task-entity, a group entity may be using >=1 cpu globally.
+ * However, in the case that it's using <1 cpu we need to form a
+ * correction term so that we contribute the same load as a task of
+ * equal weight. (Global runnable time is taken as a fraction over
+ * 2^12.)
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < (1<<12)) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib /= (1<<12);
+ }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
+ contrib = se->avg.usage_avg_sum * scale_load_down(1024);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ trace_sched_task_usage_ratio(task_of(se), scale_load(contrib));
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may be appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic64_read(&cfs_rq->removed_load)) {
+ u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ update_cfs_shares(cfs_rq);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ u32 contrib;
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (rq->avg.runnable_avg_period + 1);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ }
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } else {
+ se->avg.decay_count = 0;
+ }
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1096,9 +1483,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1190,9 +1576,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1206,7 +1591,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ se->on_rq = 0;
}
/*
@@ -1261,6 +1646,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -1340,6 +1726,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1353,9 +1741,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1448,6 +1837,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1592,14 +1990,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1611,9 +2004,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq->clock_task;
cfs_rq->throttle_count++;
return 0;
@@ -1628,7 +2021,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1652,7 +2045,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2063,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -2073,8 +2465,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2604,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}
@@ -2266,12 +2665,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2667,6 +3068,67 @@ static int select_idle_sibling(struct task_struct *p, int target)
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/* Heterogeneous multiprocessor (HMP) optimizations
+ * We need to know which cpus are fast and which are slow. */
+static struct cpumask hmp_fast_cpu_mask;
+static struct cpumask hmp_slow_cpu_mask;
+
+extern void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+ struct cpumask *slow);
+
+/* Setup fast and slow cpumasks; an initcall, so it returns 0 on success. */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+
+ arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask);
+
+ printk(KERN_DEBUG "Initializing HMP scheduler:\n");
+ cpulist_scnprintf(buf, sizeof(buf), &hmp_fast_cpu_mask);
+ printk(KERN_DEBUG " fast cpus: %s\n", buf);
+ cpulist_scnprintf(buf, sizeof(buf), &hmp_slow_cpu_mask);
+ printk(KERN_DEBUG " slow cpus: %s\n", buf);
+
+ return 0;
+}
+early_initcall(hmp_cpu_mask_setup);
+
+/* Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a fast cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slow cpu
+ * hmp_up_prio: min. task prio for tasks migrating to faster cpus */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+unsigned int hmp_up_prio = 125;
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+
+static unsigned int hmp_cpu_is_fast(int cpu)
+{
+ return cpumask_test_cpu(cpu, &hmp_fast_cpu_mask);
+}
+
+static unsigned int hmp_cpu_is_slow(int cpu)
+{
+ return cpumask_test_cpu(cpu, &hmp_slow_cpu_mask);
+}
+
+/* Select target cpu for HMP migration to fast cpu
+ * returns target >= nr_cpu_ids if no fast cpus in affinity mask */
+static inline unsigned int hmp_select_fast_cpu(struct task_struct *tsk)
+{
+ return cpumask_any_and(&hmp_fast_cpu_mask, tsk_cpus_allowed(tsk));
+}
+
+/* Select target cpu for HMP migration to slow cpu
+ * returns target >= nr_cpu_ids if no slow cpus in affinity mask */
+static inline unsigned int hmp_select_slow_cpu(struct task_struct *tsk)
+{
+ return cpumask_any_and(&hmp_slow_cpu_mask, tsk_cpus_allowed(tsk));
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -2793,8 +3255,48 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_up_migration(new_cpu, &p->se)) {
+ return hmp_select_fast_cpu(p);
+ }
+ if (hmp_down_migration(new_cpu, &p->se)) {
+ return hmp_select_slow_cpu(p);
+ }
+#endif
+
return new_cpu;
}
+
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including rq->lock state, should be made.
+ * Caller guarantees p->pi_lock held, but nothing else.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a non-zero decay-count.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ }
+}
+#endif
+
#endif /* CONFIG_SMP */
static unsigned long
@@ -3311,51 +3813,65 @@ next:
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+ if (se)
+ update_entity_load_avg(se, 1);
+ else
+ update_rq_runnable_avg(rq_of(cfs_rq), 1);
- return 0;
+ if (se) {
+ /*
+ * We can pivot on the runnable average decaying to zero for
+ * list removal since the parent average will always be >=
+ * child.
+ */
+ if (se->avg.runnable_avg_sum)
+ update_cfs_shares(cfs_rq);
+ else
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+
+ unsigned long flags;
+ int num_updates = 0;
rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Periodically release the lock so that a cfs_rq with many
+ * children cannot hold it for an arbitrary period of time.
+ */
+ if (num_updates++ % 20 == 0) {
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ }
}
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rcu_read_unlock();
}
@@ -3400,7 +3916,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
@@ -4468,12 +4984,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4733,7 +5251,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -4901,6 +5419,225 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster core */
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ if (p->prio < hmp_up_prio && p->prio > 100
+ && hmp_cpu_is_slow(cpu)
+ && cpumask_intersects(&hmp_fast_cpu_mask, tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio > hmp_up_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Check if task should migrate to a slower core (one it is allowed on) */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+
+ /* no allowed slow cpu: hmp_select_slow_cpu() would return >= nr_cpu_ids */
+ if (!cpumask_intersects(&hmp_slow_cpu_mask, tsk_cpus_allowed(p)))
+ return 0;
+ return p->prio >= hmp_up_prio || (hmp_cpu_is_fast(cpu)
+ && se->avg.load_avg_ratio < hmp_down_threshold);
+}
+
+/*
+ * hmp_can_migrate_task - may task p be moved from env->src_rq to env->dst_cpu?
+ * Returns 0 if p is not allowed on dst_cpu or is running on src_rq, else 1.
+ * Ideally this should be merged with can_migrate_task() to avoid redundancy.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if the task is cache cold or too many balance
+ * attempts have failed. This only selects which schedstats are bumped:
+ * cache-hot tasks are migrated as well, see the final return below.
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/* hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu. */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int i;
+ struct sched_entity *curr;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_cpu(i, &hmp_slow_cpu_mask) {
+ force = 0;
+ target = cpu_rq(i);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr || !entity_is_task(curr)) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ p = task_of(curr);
+ if (hmp_up_migration(i, curr)) {
+ if (!target->active_balance) {
+ target->active_balance = 1;
+ target->push_cpu = hmp_select_fast_cpu(p);
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, 1);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -4912,6 +5649,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -4966,6 +5705,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5058,6 +5799,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (p->se.avg.decay_count) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+ __synchronize_entity_decay(&p->se);
+ subtract_blocked_load_contrib(cfs_rq,
+ p->se.avg.load_avg_contrib);
+ p->se.avg.decay_count = 0;
+ }
+#endif
}
/*
@@ -5104,11 +5860,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic64_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct cfs_rq *cfs_rq;
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5140,8 +5901,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(&p->se);
+ p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * set_task_rq() will have removed our previous contribution,
+ * but we must synchronize explicitly against further decay
+ * here.
+ */
+ p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5226,10 +5998,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5276,8 +6044,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
update_cfs_shares(group_cfs_rq(se));
+ /* update contribution to parent */
+ update_entity_load_avg(se, 1);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5331,7 +6102,9 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .migrate_task_rq = migrate_task_rq_fair,
+#endif
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c..d98ae909e32 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -42,7 +42,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Use arch dependent cpu power functions
*/
-SCHED_FEAT(ARCH_POWER, false)
+SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d..129e35394b5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
unsigned long shares;
atomic_t load_weight;
+ atomic64_t load_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
+#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
+ * may be removed once load-tracking is useful for applications beyond
+ * shares distribution (e.g. load-balance).
+ */
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
/*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ u64 runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter, removed_load;
+ u64 last_decay;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ u32 tg_runnable_contrib, tg_usage_contrib;
+ u64 tg_load_contrib;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* h_load = weight * f(tg)
*
@@ -245,26 +254,30 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -408,6 +421,7 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+ struct task_struct *migrate_task;
/* cpu of this runqueue: */
int cpu;
int online;
@@ -463,6 +477,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1..0a49ee70054 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,14 +1,22 @@
/*
* Common SMP CPU bringup/teardown functions
*/
+#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
#include "smpboot.h"
+#ifdef CONFIG_SMP
+
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
/*
* For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
}
}
#endif
+
+#endif /* #ifdef CONFIG_SMP */
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+/*
+ * Per-thread bookkeeping handed to smpboot_thread_fn() as its data
+ * pointer: the cpu the thread was created for, its HP_THREAD_* status
+ * and the descriptor whose callbacks it runs.
+ */
+struct smpboot_thread_data {
+ unsigned int cpu;
+ unsigned int status;
+ struct smp_hotplug_thread *ht;
+};
+
+/*
+ * Values of smpboot_thread_data::status as used by smpboot_thread_fn():
+ * NONE is the zero-initialised state before the setup step has run,
+ * ACTIVE is set once the setup or unpark step has been processed and
+ * PARKED is set after the park callback has been called.
+ */
+enum {
+ HP_THREAD_NONE = 0,
+ HP_THREAD_ACTIVE,
+ HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data: thread data pointer (a struct smpboot_thread_data)
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Loops until kthread_should_stop() is true; at that point the optional
+ * cleanup callback is invoked, @data is freed and the function returns 0.
+ * There is no other return path.
+ */
+static int smpboot_thread_fn(void *data)
+{
+ struct smpboot_thread_data *td = data;
+ struct smp_hotplug_thread *ht = td->ht;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ preempt_disable();
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->cleanup)
+ ht->cleanup(td->cpu, cpu_online(td->cpu));
+ kfree(td);
+ return 0;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->park && td->status == HP_THREAD_ACTIVE) {
+ BUG_ON(td->cpu != smp_processor_id());
+ ht->park(td->cpu);
+ td->status = HP_THREAD_PARKED;
+ }
+ kthread_parkme();
+ /* We might have been woken for stop */
+ continue;
+ }
+
+ BUG_ON(td->cpu != smp_processor_id());
+
+ /* Check for state change setup */
+ switch (td->status) {
+ case HP_THREAD_NONE:
+ preempt_enable();
+ if (ht->setup)
+ ht->setup(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ case HP_THREAD_PARKED:
+ preempt_enable();
+ if (ht->unpark)
+ ht->unpark(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ }
+
+ if (!ht->thread_should_run(td->cpu)) {
+ preempt_enable();
+ schedule();
+ } else {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ ht->thread_fn(td->cpu);
+ }
+ }
+}
+
+/*
+ * Create the thread of descriptor @ht for @cpu unless one is already
+ * stored in @ht->store for that cpu (in which case 0 is returned).
+ *
+ * The per-thread data is allocated on the cpu's node and passed to
+ * smpboot_thread_fn(); it is freed here again if thread creation fails.
+ * On success a task reference is taken and the task is stored in the
+ * per-cpu slot.
+ *
+ * Returns 0 on success, -ENOMEM if the data allocation fails, or the
+ * error encoded in the pointer returned by kthread_create_on_cpu().
+ */
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ struct smpboot_thread_data *td;
+
+ if (tsk)
+ return 0;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+ if (!td)
+ return -ENOMEM;
+ td->cpu = cpu;
+ td->ht = ht;
+
+ tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+ ht->thread_comm);
+ if (IS_ERR(tsk)) {
+ kfree(td);
+ return PTR_ERR(tsk);
+ }
+
+ get_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = tsk;
+ return 0;
+}
+
+/*
+ * Walk the registered hotplug thread descriptors under
+ * smpboot_threads_lock and create the thread of each one for @cpu.
+ * Stops at the first failure and returns its error code; returns 0 when
+ * every descriptor was handled. Threads created before a failure are
+ * left in place by this function.
+ */
+int smpboot_create_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list) {
+ ret = __smpboot_create_thread(cur, cpu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+
+/*
+ * Unpark the thread stored for @cpu in @ht->store.
+ *
+ * NOTE(review): unlike smpboot_park_thread() the stored pointer is not
+ * checked for NULL here - presumably callers only get here after the
+ * thread has been created; confirm.
+ */
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ kthread_unpark(tsk);
+}
+
+/*
+ * Unpark the thread for @cpu of every registered descriptor, walking
+ * hotplug_threads front to back with smpboot_threads_lock held.
+ */
+void smpboot_unpark_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list)
+ smpboot_unpark_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+/*
+ * Park the thread stored for @cpu in @ht->store; does nothing when no
+ * thread is stored for that cpu.
+ */
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk)
+ kthread_park(tsk);
+}
+
+/*
+ * Park the thread for @cpu of every registered descriptor with
+ * smpboot_threads_lock held. The list is walked with
+ * list_for_each_entry_reverse(), i.e. in the opposite direction to
+ * smpboot_unpark_threads().
+ */
+void smpboot_park_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry_reverse(cur, &hotplug_threads, list)
+ smpboot_park_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+/*
+ * Stop every thread of descriptor @ht: for each possible cpu that has a
+ * task stored in @ht->store, the thread is stopped, the task reference
+ * is dropped and the per-cpu slot is cleared.
+ */
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+ unsigned int cpu;
+
+ /* We also need to destroy the parked threads of offline cpus */
+ for_each_possible_cpu(cpu) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk) {
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = NULL;
+ }
+ }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ *
+ * CPU hotplug is held off with get_online_cpus() for the duration, so no
+ * cpu can come online between the walk over the online mask and the
+ * descriptor being added to the hotplug_threads list. The lock order
+ * (hotplug first, then smpboot_threads_lock) matches
+ * smpboot_unregister_percpu_thread().
+ *
+ * Returns 0 on success. If creating a thread fails, the threads created
+ * so far are destroyed, the descriptor is not registered and the error
+ * from __smpboot_create_thread() is returned.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	unsigned int cpu;
+	int ret = 0;
+
+	get_online_cpus();
+	mutex_lock(&smpboot_threads_lock);
+	for_each_online_cpu(cpu) {
+		ret = __smpboot_create_thread(plug_thread, cpu);
+		if (ret) {
+			smpboot_destroy_threads(plug_thread);
+			goto out;
+		}
+		smpboot_unpark_thread(plug_thread, cpu);
+	}
+	list_add(&plug_thread->list, &hotplug_threads);
+out:
+	mutex_unlock(&smpboot_threads_lock);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ *
+ * CPU hotplug is held off via get_online_cpus() and smpboot_threads_lock
+ * is held while the descriptor is removed from hotplug_threads and its
+ * threads are destroyed.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ list_del(&plug_thread->list);
+ smpboot_destroy_threads(plug_thread);
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 6ef9433e1c7..72415a0eb95 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
static inline void idle_threads_init(void) { }
#endif
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b73e681df09..e04e5b28366 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <linux/tick.h>
#define CREATE_TRACE_POINTS
@@ -742,49 +743,22 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
{
- set_current_state(TASK_INTERRUPTIBLE);
-
- while (!kthread_should_stop()) {
- preempt_disable();
- if (!local_softirq_pending()) {
- schedule_preempt_disabled();
- }
-
- __set_current_state(TASK_RUNNING);
-
- while (local_softirq_pending()) {
- /* Preempt disable stops cpu going offline.
- If already offline, we'll be on wrong CPU:
- don't process */
- if (cpu_is_offline((long)__bind_cpu))
- goto wait_to_die;
- local_irq_disable();
- if (local_softirq_pending())
- __do_softirq();
- local_irq_enable();
- sched_preempt_enable_no_resched();
- cond_resched();
- preempt_disable();
- rcu_note_context_switch((long)__bind_cpu);
- }
- preempt_enable();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- return 0;
+ return local_softirq_pending();
+}
-wait_to_die:
- preempt_enable();
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ __do_softirq();
+ rcu_note_context_switch(cpu);
+ local_irq_enable();
+ cond_resched();
+ return;
}
- __set_current_state(TASK_RUNNING);
- return 0;
+ local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
-
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- p = kthread_create_on_node(run_ksoftirqd,
- hcpu,
- cpu_to_node(hotcpu),
- "ksoftirqd/%d", hotcpu);
- if (IS_ERR(p)) {
- printk("ksoftirqd for %i failed\n", hotcpu);
- return notifier_from_errno(PTR_ERR(p));
- }
- kthread_bind(p, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = p;
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
- break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(ksoftirqd, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(ksoftirqd, hotcpu),
- cpumask_any(cpu_online_mask));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
- static const struct sched_param param = {
- .sched_priority = MAX_RT_PRIO-1
- };
-
- p = per_cpu(ksoftirqd, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = NULL;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_stop(p);
+ int hotcpu = (unsigned long)hcpu;
+
takeover_tasklets(hotcpu);
break;
}
#endif /* CONFIG_HOTPLUG_CPU */
- }
+ }
return NOTIFY_OK;
}
@@ -901,14 +842,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
+static struct smp_hotplug_thread softirq_threads = {
+ .store = &ksoftirqd,
+ .thread_should_run = ksoftirqd_should_run,
+ .thread_fn = run_ksoftirqd,
+ .thread_comm = "ksoftirqd/%u",
+};
+
static __init int spawn_ksoftirqd(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
- BUG_ON(err != NOTIFY_OK);
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
+
+ BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
return 0;
}
early_initcall(spawn_ksoftirqd);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 69add8a9da6..9d4c8d5a1f5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/smpboot.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
int watchdog_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_disabled;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+ struct sched_param param = { .sched_priority = prio };
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+ sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
{
- struct sched_param param = { .sched_priority = 0 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- /* initialize timestamp */
- __touch_watchdog();
+ if (!watchdog_enabled) {
+ kthread_park(current);
+ return;
+ }
+
+ /* Enable the perf event */
+ watchdog_nmi_enable(cpu);
/* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
/* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
HRTIMER_MODE_REL_PINNED);
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly (kicked by the hrtimer callback function) once every
- * get_sample_period() seconds (4 seconds by default) to reset the
- * softlockup timestamp. If this gets delayed for more than
- * 2*watchdog_thresh seconds then the debug-printout triggers in
- * watchdog_timer_fn().
- */
- while (!kthread_should_stop()) {
- __touch_watchdog();
- schedule();
+ /* initialize timestamp */
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ __touch_watchdog();
+}
+
+static void watchdog_disable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- if (kthread_should_stop())
- break;
+ watchdog_set_prio(SCHED_NORMAL, 0);
+ hrtimer_cancel(hrtimer);
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
+}
- set_current_state(TASK_INTERRUPTIBLE);
- }
- /*
- * Drop the policy/priority elevation during thread exit to avoid a
- * scheduling latency spike.
- */
- __set_current_state(TASK_RUNNING);
- sched_setscheduler(current, SCHED_NORMAL, &param);
- return 0;
+static int watchdog_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(hrtimer_interrupts) !=
+ __this_cpu_read(soft_lockup_hrtimer_cnt);
}
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every get_sample_period() seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ *
+ * Before touching the timestamp it copies this cpu's hrtimer_interrupts
+ * count into soft_lockup_hrtimer_cnt, so that watchdog_should_run()
+ * reports no work until hrtimer_interrupts changes again.
+ * @cpu is not used in the body.
+ */
+static void watchdog(unsigned int cpu)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
*/
static unsigned long cpu0_err;
-static int watchdog_nmi_enable(int cpu)
+static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
return 0;
}
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
return;
}
#else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- WARN_ON(per_cpu(softlockup_watchdog, cpu));
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-}
-
-static int watchdog_enable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err = 0;
-
- /* enable the perf event */
- err = watchdog_nmi_enable(cpu);
-
- /* Regardless of err above, fall through and start softlockup */
-
- /* create the watchdog thread */
- if (!p) {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
- if (IS_ERR(p)) {
- pr_err("softlockup watchdog for %i failed\n", cpu);
- if (!err) {
- /* if hardlockup hasn't already set this */
- err = PTR_ERR(p);
- /* and disable the perf event */
- watchdog_nmi_disable(cpu);
- }
- goto out;
- }
- sched_setscheduler(p, SCHED_FIFO, &param);
- kthread_bind(p, cpu);
- per_cpu(watchdog_touch_ts, cpu) = 0;
- per_cpu(softlockup_watchdog, cpu) = p;
- wake_up_process(p);
- }
-
-out:
- return err;
-}
-
-static void watchdog_disable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- /*
- * cancel the timer first to stop incrementing the stats
- * and waking up the kthread
- */
- hrtimer_cancel(hrtimer);
-
- /* disable the perf event */
- watchdog_nmi_disable(cpu);
-
- /* stop the watchdog thread */
- if (p) {
- per_cpu(softlockup_watchdog, cpu) = NULL;
- kthread_stop(p);
- }
-}
-
/* sysctl functions */
#ifdef CONFIG_SYSCTL
static void watchdog_enable_all_cpus(void)
{
- int cpu;
-
- watchdog_enabled = 0;
-
- for_each_online_cpu(cpu)
- if (!watchdog_enable(cpu))
- /* if any cpu succeeds, watchdog is considered
- enabled for the system */
- watchdog_enabled = 1;
-
- if (!watchdog_enabled)
- pr_err("failed to be enabled on some cpus\n");
+ unsigned int cpu;
+ if (watchdog_disabled) {
+ watchdog_disabled = 0;
+ for_each_online_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
}
static void watchdog_disable_all_cpus(void)
{
- int cpu;
+ unsigned int cpu;
- for_each_online_cpu(cpu)
- watchdog_disable(cpu);
-
- /* if all watchdogs are disabled, then they are disabled for the system */
- watchdog_enabled = 0;
+ if (!watchdog_disabled) {
+ watchdog_disabled = 1;
+ for_each_online_cpu(cpu)
+ kthread_park(per_cpu(softlockup_watchdog, cpu));
+ }
}
-
/*
* proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
*/
@@ -557,90 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
{
int ret;
+ if (watchdog_disabled < 0)
+ return -ENODEV;
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
- goto out;
+ return ret;
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
watchdog_disable_all_cpus();
-out:
return ret;
}
#endif /* CONFIG_SYSCTL */
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- watchdog_prepare_cpu(hotcpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (watchdog_enabled)
- watchdog_enable(hotcpu);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- watchdog_disable(hotcpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- watchdog_disable(hotcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
-
- /*
- * hardlockup and softlockup are not important enough
- * to block cpu bring up. Just always succeed and
- * rely on printk output to flag problems.
- */
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
+static struct smp_hotplug_thread watchdog_threads = {
+ .store = &softlockup_watchdog,
+ .thread_should_run = watchdog_should_run,
+ .thread_fn = watchdog,
+ .thread_comm = "watchdog/%u",
+ .setup = watchdog_enable,
+ .park = watchdog_disable,
+ .unpark = watchdog_enable,
};
-#ifdef CONFIG_SUSPEND
-/*
- * On exit from suspend we force an offline->online transition on the boot CPU
- * so that the PMU state that was lost while in suspended state gets set up
- * properly for the boot CPU. This information is required for restarting the
- * NMI watchdog.
- */
-void lockup_detector_bootcpu_resume(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
-
- cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
- cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
-}
-#endif
-
void __init lockup_detector_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- WARN_ON(notifier_to_errno(err));
-
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
- register_cpu_notifier(&cpu_nfb);
-
- return;
+ if (smpboot_register_percpu_thread(&watchdog_threads)) {
+ pr_err("Failed to create watchdog threads, disabled\n");
+ watchdog_disabled = -ENODEV;
+ }
}
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..df35474eff1
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,9 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK="0-1"
+CONFIG_HMP_SLOW_CPU_MASK="2-3"