aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c359
1 files changed, 315 insertions, 44 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 8d43347a0c0..5b3f6ec1b0b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -73,13 +73,15 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -119,12 +121,6 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
#ifdef CONFIG_SMP
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -632,6 +628,10 @@ struct rq {
struct list_head migration_queue;
#endif
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
@@ -1730,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
+static void calc_load_account_active(struct rq *this_rq);
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1960,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
clock_offset = old_rq->clock - new_rq->clock;
- trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ trace_sched_migrate_task(p, new_cpu);
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
@@ -2021,6 +2023,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
}
/*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+ unsigned long nvcsw, nivcsw, flags;
+ int running;
+ struct rq *rq;
+
+ nvcsw = p->nvcsw; /* baseline switch counts, sampled once before polling */
+ nivcsw = p->nivcsw;
+ for (;;) {
+ /*
+ * The runqueue is assigned before the actual context
+ * switch. We need to take the runqueue lock.
+ *
+ * We could check initially without the lock but it is
+ * very likely that we need to take the lock in every
+ * iteration.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ task_rq_unlock(rq, &flags);
+
+ if (likely(!running)) /* task_running() reported p as not running: done */
+ break;
+ /*
+ * The switch count is incremented before the actual
+ * context switch. We thus wait for two switches to be
+ * sure at least one completed.
+ */
+ if ((p->nvcsw - nvcsw) > 1) /* counters are re-read without the rq lock */
+ break;
+ if ((p->nivcsw - nivcsw) > 1)
+ break;
+
+ cpu_relax(); /* busy-wait: nothing in this loop sleeps between polls */
+ }
+}
+
+/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2485,6 +2530,17 @@ out:
return success;
}
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_ALL, 0);
@@ -2795,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* combine the page table reload and the switch backend into
* one hypercall.
*/
- arch_enter_lazy_cpu_mode();
+ arch_start_context_switch(prev);
if (unlikely(!mm)) {
next->active_mm = oldmm;
@@ -2885,19 +2941,72 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - copy the three avenrun[] load averages, offset and scaled
+ * @loads: destination array; elements 0..2 are written
+ * @offset: value added to each avenrun[] entry before shifting
+ * @shift: number of bits each sum is shifted left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- unsigned long i, running = 0, uninterruptible = 0;
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
- for_each_online_cpu(i) {
- running += cpu_rq(i)->nr_running;
- uninterruptible += cpu_rq(i)->nr_uninterruptible;
- }
+/*
+ * calc_global_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+ unsigned long upd = calc_load_update + 10;
+ long active;
- if (unlikely((long)uninterruptible < 0))
- uninterruptible = 0;
+ if (time_before(jiffies, upd))
+ return;
- return running + uninterruptible;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long nr_active, delta;
+
+ nr_active = this_rq->nr_running; /* active = nr_running + nr_uninterruptible */
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active; /* remember what this rq last accounted */
+ atomic_long_add(delta, &calc_load_tasks); /* only the difference goes to the global sum */
+ }
}
/*
@@ -2937,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
+
+ if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ this_rq->calc_load_update += LOAD_FREQ;
+ calc_load_account_active(this_rq);
+ }
}
#ifdef CONFIG_SMP
@@ -4278,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
static struct {
atomic_t load_balancer;
cpumask_var_t cpu_mask;
+ cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
.load_balancer = ATOMIC_INIT(-1),
};
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the first sched_domain visited by for_each_domain() with @flag set.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd)
+ if (sd && (sd->flags & flag))
+ break;
+
+ return sd; /* NOTE(review): presumably NULL when no domain matched; for_each_flag_domain() tests sd before use */
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: struct sched_domain pointer used as the loop cursor; the
+ * macro assigns it (and expands it more than once).
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent, stopping at
+ * the first domain that is NULL or does not have 'flag' set.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+ for (sd = lowest_flag_domain(cpu, flag); \
+ (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has at least one idle-CPU
+ * and at least one non-idle CPU. As a side effect nohz.ilb_grp_nohz_mask is
+ * left holding the group's cpus that are also set in nohz.cpu_mask.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+ cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ sched_group_cpus(ilb_group));
+
+ /*
+ * A sched_group is semi-idle when it has at least one busy cpu
+ * and at least one idle cpu.
+ */
+ if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ return 0; /* no cpu of the group is set in nohz.cpu_mask */
+
+ if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ return 0; /* every cpu of the group is set in nohz.cpu_mask */
+
+ return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: the id of the chosen idle load balancer if one exists,
+ * else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpus which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *ilb_group;
+
+ /*
+ * Have idle load balancer selection from semi-idle packages only
+ * when power-aware load balancing is enabled
+ */
+ if (!(sched_smt_power_savings || sched_mc_power_savings))
+ goto out_done;
+
+ /*
+ * Optimize for the case when we have no idle CPUs or only one
+ * idle CPU. Don't walk the sched_domain hierarchy in such cases
+ */
+ if (cpumask_weight(nohz.cpu_mask) < 2)
+ goto out_done;
+
+ for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+ ilb_group = sd->groups;
+
+ do {
+ if (is_semi_idle_group(ilb_group))
+ return cpumask_first(nohz.ilb_grp_nohz_mask); /* mask was just filled in by is_semi_idle_group() */
+
+ ilb_group = ilb_group->next;
+
+ } while (ilb_group != sd->groups); /* follow ->next until back at the first group */
+ }
+
+out_done:
+ return cpumask_first(nohz.cpu_mask); /* fallback: first cpu set in nohz.cpu_mask */
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+ return cpumask_first(nohz.cpu_mask); /* call_cpu is unused in this variant */
+}
+#endif
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4336,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick)
/* make me the ilb owner */
if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu)
+ } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ int new_ilb;
+
+ if (!(sched_smt_power_savings ||
+ sched_mc_power_savings))
+ return 1;
+ /*
+ * Check to see if there is a more power-efficient
+ * ilb.
+ */
+ new_ilb = find_new_ilb(cpu);
+ if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+ atomic_set(&nohz.load_balancer, -1);
+ resched_cpu(new_ilb);
+ return 0;
+ }
return 1;
+ }
} else {
if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
return 0;
@@ -4506,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
}
if (atomic_read(&nohz.load_balancer) == -1) {
- /*
- * simple selection for now: Nominate the
- * first cpu in the nohz list to be the next
- * ilb owner.
- *
- * TBD: Traverse the sched domains and nominate
- * the nearest cpu in the nohz.cpu_mask.
- */
- int ilb = cpumask_first(nohz.cpu_mask);
+ int ilb = find_new_ilb(cpu);
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
@@ -5047,13 +5285,15 @@ pick_next_task(struct rq *rq)
/*
* schedule() is the main scheduler function.
*/
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
+need_resched:
+ preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
@@ -5111,15 +5351,9 @@ need_resched_nonpreemptible:
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
-}
-asmlinkage void __sched schedule(void)
-{
-need_resched:
- preempt_disable();
- __schedule();
preempt_enable_no_resched();
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ if (need_resched())
goto need_resched;
}
EXPORT_SYMBOL(schedule);
@@ -5262,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
{
wait_queue_t *curr, *next;
@@ -5282,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
@@ -5320,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
@@ -5356,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
@@ -5373,6 +5616,9 @@ EXPORT_SYMBOL(complete);
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
@@ -6531,8 +6777,9 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
- printk(KERN_CONT "%5lu %5d %6d\n", free,
- task_pid_nr(p), task_pid_nr(p->real_parent));
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), task_pid_nr(p->real_parent),
+ (unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
}
@@ -7011,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
}
}
+
+/*
+ * Subtract rq->calc_load_active, the tasks rq has accounted, from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks); /* rq->calc_load_active itself is left unchanged here */
+}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7245,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
+ rq->calc_load_update = calc_load_update;
+ rq->calc_load_active = 0;
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7284,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
-
+ calc_global_load_remove(rq);
/*
* No need to migrate the tasks: it was best-effort if
* they didn't take sched_hotcpu_mutex. Just wake up
@@ -7796,8 +8053,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ * and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
@@ -7918,7 +8176,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j).sd;
- if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+ if (j != group_first_cpu(sd->groups)) {
/*
* Only add "power" once for each
* physical package.
@@ -7996,7 +8254,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
WARN_ON(!sd || !sd->groups);
- if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+ if (cpu != group_first_cpu(sd->groups))
return;
child = sd->child;
@@ -8981,6 +9239,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9088,6 +9348,9 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
/*
* During early bootup we pretend to be a normal task:
*/
@@ -9098,6 +9361,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
#endif
alloc_bootmem_cpumask_var(&cpu_isolated_map);
#endif /* SMP */
@@ -9845,6 +10109,13 @@ static int sched_rt_global_constraints(void)
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
+ /*
+ * There are always some RT tasks in the root group
+ * -- migration, kstopmachine etc.
+ */
+ if (sysctl_sched_rt_runtime == 0)
+ return -EBUSY;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;