-rw-r--r--  arch/x86/kernel/smp.c               |  20
-rw-r--r--  drivers/cpuidle/Kconfig             |   4
-rw-r--r--  drivers/cpuidle/cpuidle.c           |  21
-rw-r--r--  drivers/cpuidle/governors/Makefile  |   1
-rw-r--r--  drivers/cpuidle/governors/ladder.c  |  11
-rw-r--r--  drivers/cpuidle/governors/menu.c    |  15
-rw-r--r--  drivers/cpuidle/governors/select.c  |  55
-rw-r--r--  drivers/cpuidle/sysfs.c             | 156
-rw-r--r--  include/linux/cpuidle.h             |  23
-rw-r--r--  include/linux/sched.h               |  21
-rw-r--r--  include/linux/sched/sysctl.h        |   8
-rw-r--r--  include/trace/events/io_latency.h   |  32
-rw-r--r--  init/Kconfig                        |  11
-rw-r--r--  kernel/exit.c                       |   1
-rw-r--r--  kernel/fork.c                       |   5
-rw-r--r--  kernel/sched/Makefile               |   2
-rw-r--r--  kernel/sched/core.c                 |   7
-rw-r--r--  kernel/sched/fair.c                 |   4
-rw-r--r--  kernel/sched/idle.c                 |  33
-rw-r--r--  kernel/sched/idle_debug.c           | 126
-rw-r--r--  kernel/sched/idle_debug.h           |  29
-rw-r--r--  kernel/sched/io_latency.c           | 442
-rw-r--r--  kernel/sched/io_latency.h           |  38
-rw-r--r--  kernel/sysctl.c                     |  11
-rw-r--r--  lib/Kconfig.debug                   |   9
25 files changed, 1052 insertions, 33 deletions
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa..d193609beabc 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -31,6 +31,16 @@
#include <asm/apic.h>
#include <asm/nmi.h>
#include <asm/trace/irq_vectors.h>
+
+#define CREATE_TRACE_POINTS
+/*
+ * These macros were defined by <asm/trace/irq_vectors.h> and cause
+ * problems when including <trace/events/ipi.h>, so undefine them first.
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#include <trace/events/ipi.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -124,11 +134,13 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
+ trace_ipi_raise(cpumask_of(cpu), tracepoint_string("RESCHEDULE"));
apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
+ trace_ipi_raise(cpumask_of(cpu), tracepoint_string("CALL_FUNCTION_SINGLE"));
apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
@@ -136,6 +148,8 @@ void native_send_call_func_ipi(const struct cpumask *mask)
{
cpumask_var_t allbutself;
+ trace_ipi_raise(mask, tracepoint_string("CALL_FUNCTION"));
+
if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
return;
@@ -252,8 +266,10 @@ finish:
*/
static inline void __smp_reschedule_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("RESCHEDULE"));
inc_irq_stat(irq_resched_count);
scheduler_ipi();
+ trace_ipi_exit(tracepoint_string("RESCHEDULE"));
}
__visible void smp_reschedule_interrupt(struct pt_regs *regs)
@@ -291,8 +307,10 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
static inline void __smp_call_function_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("CALL_FUNCTION"));
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
+ trace_ipi_exit(tracepoint_string("CALL_FUNCTION"));
}
__visible void smp_call_function_interrupt(struct pt_regs *regs)
@@ -313,8 +331,10 @@ __visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
static inline void __smp_call_function_single_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("CALL_FUNCTION_SINGLE"));
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
+ trace_ipi_exit(tracepoint_string("CALL_FUNCTION_SINGLE"));
}
__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
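
For reference, the trace_ipi_raise()/trace_ipi_entry()/trace_ipi_exit() calls added above assume that <trace/events/ipi.h> declares matching events (instantiated here because of the CREATE_TRACE_POINTS define). A minimal sketch of the handler-side event class (illustrative only; the real header may differ) could look like this:

DECLARE_EVENT_CLASS(ipi_handler,

	TP_PROTO(const char *reason),

	TP_ARGS(reason),

	TP_STRUCT__entry(
		/* the reason is a persistent tracepoint_string(), so storing
		 * the pointer is enough */
		__field(const char *, reason)
	),

	TP_fast_assign(
		__entry->reason = reason;
	),

	TP_printk("(%s)", __entry->reason)
);

DEFINE_EVENT(ipi_handler, ipi_entry,
	TP_PROTO(const char *reason),
	TP_ARGS(reason)
);

DEFINE_EVENT(ipi_handler, ipi_exit,
	TP_PROTO(const char *reason),
	TP_ARGS(reason)
);
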
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c5029c1209b4..6deb2473c69c 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -25,6 +25,10 @@ config CPU_IDLE_GOV_MENU
bool "Menu governor (for tickless system)"
default y
+config CPU_IDLE_GOV_SELECT
+ bool "Select governor (for tickless system)"
+ default y
+
config DT_IDLE_STATES
bool
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150dc6e81..a79c4db22ffa 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,16 +8,12 @@
* This code is licenced under the GPL.
*/
-#include <linux/clockchips.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
-#include <linux/sched.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
#include <linux/module.h>
#include <trace/events/power.h>
@@ -122,6 +118,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ktime_get();
+ target_state->idle_start = ktime_to_us(time_start);
+
entered_state = target_state->enter(dev, drv, index);
time_end = ktime_get();
@@ -134,6 +132,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
if (diff > INT_MAX)
diff = INT_MAX;
+ target_state->idle_start = 0;
+
dev->last_residency = (int) diff;
if (entered_state >= 0) {
@@ -143,6 +143,14 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
*/
dev->states_usage[entered_state].time += dev->last_residency;
dev->states_usage[entered_state].usage++;
+ if (diff < dev->last_residency)
+ atomic_inc(&dev->over_estimate);
+ else if (entered_state < (drv->state_count - 1) &&
+ dev->last_residency <
+ drv->states[entered_state + 1].target_residency)
+ atomic_inc(&dev->under_estimate);
+ else
+ atomic_inc(&dev->right_estimate);
} else {
dev->last_residency = 0;
}
@@ -158,7 +166,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
*
* Returns the index of the idle state.
*/
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
if (off || !initialized)
return -ENODEV;
@@ -169,7 +178,7 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
if (unlikely(use_deepest_state))
return cpuidle_find_deepest_state(drv, dev);
- return cpuidle_curr_governor->select(drv, dev);
+ return cpuidle_curr_governor->select(drv, dev, times);
}
/**
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 1b512722689f..fa455206fede 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,3 +4,4 @@
obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_SELECT) += select.o
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 06b57c4c4d80..4f5ec8ade22e 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -64,18 +64,13 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
* @dev: the CPU
*/
static int ladder_select_state(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
struct ladder_device_state *last_state;
+ int latency_req = times->latency_req;
int last_residency, last_idx = ldev->last_state_idx;
- int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-
- /* Special case when user has set very strict latency requirement */
- if (unlikely(latency_req == 0)) {
- ladder_do_selection(ldev, last_idx, 0);
- return 0;
- }
last_state = &ldev->states[last_idx];
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 710a233b9b0d..e360b08ea44a 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -13,10 +13,6 @@
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
-#include <linux/time.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/module.h>
@@ -287,10 +283,11 @@ again:
* @drv: cpuidle driver containing state data
* @dev: the CPU
*/
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
struct menu_device *data = this_cpu_ptr(&menu_devices);
- int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+ int latency_req = times->latency_req;
int i;
unsigned int interactivity_req;
unsigned long nr_iowaiters, cpu_load;
@@ -302,12 +299,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
- /* Special case when user has set very strict latency requirement */
- if (unlikely(latency_req == 0))
- return 0;
-
/* determine the expected residency time, round up */
- data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ data->next_timer_us = times->next_timer_event;
get_iowait_load(&nr_iowaiters, &cpu_load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
diff --git a/drivers/cpuidle/governors/select.c b/drivers/cpuidle/governors/select.c
new file mode 100644
index 000000000000..2193b7870b34
--- /dev/null
+++ b/drivers/cpuidle/governors/select.c
@@ -0,0 +1,55 @@
+/*
+ * select.c - the select governor
+ *
+ * Copyright (C) 2014 Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+
+#include <linux/cpuidle.h>
+
+static int select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
+{
+ int i, index = 0, latency_req = times->latency_req;
+ unsigned int next_event;
+
+ /*
+ * If the guessed IO next event is zero, that means there is no IO
+ * pending, so we ignore it in the equation
+ */
+ next_event = times->next_io_event ?
+ min(times->next_io_event, times->next_timer_event) :
+ times->next_timer_event;
+
+ for (i = 0; i < drv->state_count; i++) {
+
+ struct cpuidle_state *s = &drv->states[i];
+ struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+ if (s->disabled || su->disable)
+ continue;
+ if (s->target_residency > next_event)
+ continue;
+ if (s->exit_latency > latency_req)
+ continue;
+
+ index = i;
+ }
+
+ return index;
+}
+
+static struct cpuidle_governor select_governor = {
+ .name = "select",
+ .rating = 30,
+ .select = select,
+ .owner = THIS_MODULE,
+};
+
+static int __init select_init(void)
+{
+ return cpuidle_register_governor(&select_governor);
+}
+
+postcore_initcall(select_init);
+
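
To see what the selection loop above does, consider a hypothetical four-state driver (illustrative numbers only) with latency_req = 300 us, next_timer_event = 3000 us and next_io_event = 800 us, so next_event = min(800, 3000) = 800 us:

/*   state   target_residency   exit_latency
 *     0              1 us            1 us    fits, index = 0
 *     1            100 us           50 us    fits, index = 1
 *     2            700 us          200 us    fits, index = 2
 *     3           2000 us         1000 us    skipped (2000 > 800)
 *
 * Each state satisfying both constraints overwrites 'index', so the
 * deepest state that still fits (state 2 here) is returned.
 */
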
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 97c5903b4606..f446bd0fd9bd 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -439,6 +439,154 @@ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
cpuidle_free_state_kobj(device, i);
}
+#define kobj_to_stats_kobj(k) container_of(k, struct cpuidle_stats_kobj, kobj)
+#define attr_to_stats_attr(a) container_of(a, struct cpuidle_stats_attr, attr)
+
+#define define_show_stats_function(_name) \
+ static ssize_t show_stats_##_name(struct cpuidle_device *dev, \
+ char *buf) \
+ { \
+ return sprintf(buf, "%d\n", atomic_read(&dev->_name)); \
+ }
+
+#define define_store_stats_function(_name) \
+ static ssize_t store_stats_##_name(struct cpuidle_device *dev, \
+ const char *buf, size_t size) \
+ { \
+ unsigned long long value; \
+ int err; \
+ if (!capable(CAP_SYS_ADMIN)) \
+ return -EPERM; \
+ err = kstrtoull(buf, 0, &value); \
+ if (err) \
+ return err; \
+ \
+ atomic_set(&dev->_name, value); \
+ return size; \
+ }
+
+#define define_one_stats_rw(_name, show, store) \
+ static struct cpuidle_stats_attr attr_stats_##_name = \
+ __ATTR(_name, 0644, show, store)
+
+struct cpuidle_stats_kobj {
+ struct cpuidle_device *dev;
+ struct completion kobj_unregister;
+ struct kobject kobj;
+};
+
+struct cpuidle_stats_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct cpuidle_device *, char *);
+ ssize_t (*store)(struct cpuidle_device *, const char *, size_t);
+};
+
+static void cpuidle_stats_sysfs_release(struct kobject *kobj)
+{
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ complete(&stats_kobj->kobj_unregister);
+}
+
+static ssize_t cpuidle_stats_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ int ret = -EIO;
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+ if (dattr->show)
+ ret = dattr->show(stats_kobj->dev, buf);
+
+ return ret;
+}
+
+static ssize_t cpuidle_stats_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t size)
+{
+ int ret = -EIO;
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+ if (dattr->store)
+ ret = dattr->store(stats_kobj->dev, buf, size);
+
+ return ret;
+}
+
+define_show_stats_function(right_estimate);
+define_store_stats_function(right_estimate);
+
+define_show_stats_function(under_estimate);
+define_store_stats_function(under_estimate);
+
+define_show_stats_function(over_estimate);
+define_store_stats_function(over_estimate);
+
+define_one_stats_rw(right_estimate,
+ show_stats_right_estimate,
+ store_stats_right_estimate);
+
+define_one_stats_rw(under_estimate,
+ show_stats_under_estimate,
+ store_stats_under_estimate);
+
+define_one_stats_rw(over_estimate,
+ show_stats_over_estimate,
+ store_stats_over_estimate);
+
+static const struct sysfs_ops cpuidle_stats_sysfs_ops = {
+ .show = cpuidle_stats_show,
+ .store = cpuidle_stats_store,
+};
+
+static struct attribute *cpuidle_stats_default_attrs[] = {
+ &attr_stats_right_estimate.attr,
+ &attr_stats_under_estimate.attr,
+ &attr_stats_over_estimate.attr,
+ NULL
+};
+
+static struct kobj_type ktype_stats_cpuidle = {
+ .sysfs_ops = &cpuidle_stats_sysfs_ops,
+ .default_attrs = cpuidle_stats_default_attrs,
+ .release = cpuidle_stats_sysfs_release,
+};
+
+static int cpuidle_add_stats_sysfs(struct cpuidle_device *dev)
+{
+ struct cpuidle_stats_kobj *kstats;
+ struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+ int ret;
+
+ kstats = kzalloc(sizeof(*kstats), GFP_KERNEL);
+ if (!kstats)
+ return -ENOMEM;
+
+ kstats->dev = dev;
+ init_completion(&kstats->kobj_unregister);
+
+ ret = kobject_init_and_add(&kstats->kobj, &ktype_stats_cpuidle,
+ &kdev->kobj, "stats");
+ if (ret) {
+ kfree(kstats);
+ return ret;
+ }
+
+ kobject_uevent(&kstats->kobj, KOBJ_ADD);
+ dev->kobj_stats = kstats;
+
+ return ret;
+}
+
+static void cpuidle_remove_stats_sysfs(struct cpuidle_device *dev)
+{
+ struct cpuidle_stats_kobj *kstats = dev->kobj_stats;
+ kobject_put(&kstats->kobj);
+ wait_for_completion(&kstats->kobj_unregister);
+ kfree(kstats);
+}
+
#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
#define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj)
#define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr)
@@ -589,6 +737,13 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
ret = cpuidle_add_driver_sysfs(device);
if (ret)
cpuidle_remove_state_sysfs(device);
+
+ ret = cpuidle_add_stats_sysfs(device);
+ if (ret) {
+ cpuidle_remove_driver_sysfs(device);
+ cpuidle_remove_state_sysfs(device);
+ }
+
return ret;
}
@@ -598,6 +753,7 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
*/
void cpuidle_remove_device_sysfs(struct cpuidle_device *device)
{
+ cpuidle_remove_stats_sysfs(device);
cpuidle_remove_driver_sysfs(device);
cpuidle_remove_state_sysfs(device);
}
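
With the hunks above, each registered cpuidle device gains a "stats" kobject exposing the three counters as read/write attributes; writing a value (CAP_SYS_ADMIN required) resets them. Assuming the usual cpuidle sysfs location for the per-cpu device, a small userspace sketch to sample one counter could be:

#include <stdio.h>

int main(void)
{
	/* path assumed from the standard cpuidle sysfs layout; adjust if it differs */
	const char *path =
		"/sys/devices/system/cpu/cpu0/cpuidle/stats/over_estimate";
	unsigned long long val;
	FILE *fp = fopen(path, "r");

	if (!fp)
		return 1;
	if (fscanf(fp, "%llu", &val) == 1)
		printf("cpu0 over_estimate: %llu\n", val);
	fclose(fp);
	return 0;
}
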
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 25e0df6155a4..3ac0ded7d75a 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,7 +44,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned int target_residency; /* in US */
bool disabled; /* disabled on all CPUs */
-
+ s64 idle_start;
int (*enter) (struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index);
@@ -62,6 +62,7 @@ struct cpuidle_state {
struct cpuidle_device_kobj;
struct cpuidle_state_kobj;
struct cpuidle_driver_kobj;
+struct cpuidle_stats_kobj;
struct cpuidle_device {
unsigned int registered:1;
@@ -74,8 +75,13 @@ struct cpuidle_device {
struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
struct cpuidle_driver_kobj *kobj_driver;
struct cpuidle_device_kobj *kobj_dev;
+ struct cpuidle_stats_kobj *kobj_stats;
struct list_head device_list;
+ atomic_t right_estimate;
+ atomic_t under_estimate;
+ atomic_t over_estimate;
+
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
int safe_state_index;
cpumask_t coupled_cpus;
@@ -83,6 +89,12 @@ struct cpuidle_device {
#endif
};
+struct cpuidle_times {
+ unsigned int latency_req;
+ unsigned int next_timer_event;
+ unsigned int next_io_event;
+};
+
DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev);
@@ -122,7 +134,8 @@ struct cpuidle_driver {
extern void disable_cpuidle(void);
extern int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times);
extern int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index);
extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -150,7 +163,8 @@ extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev)
#else
static inline void disable_cpuidle(void) { }
static inline int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{return -ENODEV; }
static inline int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
@@ -205,7 +219,8 @@ struct cpuidle_governor {
struct cpuidle_device *dev);
int (*select) (struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times);
void (*reflect) (struct cpuidle_device *dev, int index);
struct module *owner;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a100d8713..fc3a7cf107ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1284,6 +1284,24 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+
+#ifdef CONFIG_SCHED_IO_LATENCY
+struct io_latency_node {
+ struct rb_node node;
+ unsigned int avg_latency;
+ ktime_t start_time;
+ ktime_t end_time;
+ struct list_head bucket_list;
+};
+
+void exit_io_latency(struct task_struct *tsk);
+#else
+static inline void exit_io_latency(struct task_struct *tsk)
+{
+ ;
+}
+#endif
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1708,6 +1726,9 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
+#ifdef CONFIG_SCHED_IO_LATENCY
+ struct io_latency_node io_latency;
+#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 596a0e007c62..2d73bd23e206 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,14 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_sched_energy_option;
+
+int sched_proc_energy_option_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
+
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
diff --git a/include/trace/events/io_latency.h b/include/trace/events/io_latency.h
new file mode 100644
index 000000000000..ab679fcd8d27
--- /dev/null
+++ b/include/trace/events/io_latency.h
@@ -0,0 +1,32 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM io_latency
+
+#if !defined(_TRACE_IO_LATENCY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_LATENCY_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(io_latency_entry,
+
+ TP_PROTO(u64 latency, u64 avg_latency),
+
+ TP_ARGS(latency, avg_latency),
+
+ TP_STRUCT__entry(
+ __field( u64, latency )
+ __field( u64, avg_latency )
+ ),
+
+ TP_fast_assign(
+ __entry->latency = latency;
+ __entry->avg_latency = avg_latency;
+ ),
+
+ TP_printk("latency=%llu, avg latency=%llu",
+ __entry->latency, __entry->avg_latency)
+);
+
+#endif /* _TRACE_IO_LATENCY_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/init/Kconfig b/init/Kconfig
index 3ee28ae02cc8..b849c0947dd0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1230,6 +1230,17 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_IO_LATENCY
+ bool "IO latency tracking for the scheduler"
+ depends on SMP
+ help
+ This option tracks, for each task, the average time it spends
+ blocked on IO. It gives more information about how long a cpu
+ will remain idle and helps to take better scheduling and
+ cpuidle decisions.
+
+ If unsure, say Y.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..8e4e75d5efaa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -759,6 +759,7 @@ void do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread();
+ exit_io_latency(tsk);
/*
* Flush inherited counters to the parent - before the parent
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..13b5cbf53628 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
-
+#ifdef CONFIG_SCHED_IO_LATENCY
+ tsk->io_latency.avg_latency = 0;
+ INIT_LIST_HEAD(&tsk->io_latency.bucket_list);
+#endif
account_kernel_stack(ti, 1);
return tsk;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7cd404cd5608..5d5380090741 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,4 +19,6 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_IO_LATENCY) += io_latency.o
+obj-$(CONFIG_SCHED_IDLE_DEBUG) += idle_debug.o
obj-$(CONFIG_CPU_FREQ_GOV_ENERGY_MODEL) += energy_model.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02ec4a5b5fd3..4db493445cd6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,7 @@
#endif
#include "sched.h"
+#include "io_latency.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
@@ -4396,7 +4397,9 @@ void __sched io_schedule(void)
atomic_inc(&rq->nr_iowait);
blk_flush_plug(current);
current->in_iowait = 1;
+ io_latency_begin(rq, current);
schedule();
+ io_latency_end(rq, current);
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -4412,7 +4415,9 @@ long __sched io_schedule_timeout(long timeout)
atomic_inc(&rq->nr_iowait);
blk_flush_plug(current);
current->in_iowait = 1;
+ io_latency_begin(rq, current);
ret = schedule_timeout(timeout);
+ io_latency_end(rq, current);
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -7167,6 +7172,8 @@ void __init sched_init(void)
autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
+
+ io_latency_init();
for_each_possible_cpu(i) {
struct rq *rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 21043e9b31ae..5f3056215e41 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -62,6 +62,10 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
enum sched_tunable_scaling sysctl_sched_tunable_scaling
= SCHED_TUNABLESCALING_LOG;
+#ifdef CONFIG_SMP
+unsigned int sysctl_sched_energy_option = 0; /* Experimental code, disabled by default */
+#endif
+
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..9fed4d593773 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -4,7 +4,8 @@
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/tick.h>
+#include <linux/ktime.h>
+#include <linux/pm_qos.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
@@ -13,6 +14,8 @@
#include <trace/events/power.h>
#include "sched.h"
+#include "io_latency.h"
+#include "idle_debug.h"
static int __read_mostly cpu_idle_force_poll;
@@ -78,8 +81,9 @@ static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ struct cpuidle_times times;
int next_state, entered_state;
- unsigned int broadcast;
+ bool broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -103,11 +107,29 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
+ times.latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+ /*
+ * The latency requirement does not allow any latency, jump to
+ * the default idle function without entering the cpuidle code
+ */
+ if (times.latency_req == 0)
+ goto use_default;
+
+ /*
+ * Retrieve the next timer event
+ */
+ times.next_timer_event = ktime_to_us(tick_nohz_get_sleep_length());
+
+ /*
+ * Retrieve the next IO guessed event
+ */
+ times.next_io_event = io_latency_get_sleep_length(this_rq());
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &times);
if (next_state < 0) {
use_default:
/*
@@ -160,6 +182,11 @@ use_default:
/* The cpu is no longer idle or about to enter idle. */
idle_set_state(this_rq(), NULL);
+ /*
+ * Update the prediction rating
+ */
+ idle_debug_prediction_update(drv, dev, &times, entered_state);
+
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
diff --git a/kernel/sched/idle_debug.c b/kernel/sched/idle_debug.c
new file mode 100644
index 000000000000..9cb17452729d
--- /dev/null
+++ b/kernel/sched/idle_debug.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/atomic.h>
+#include <linux/init.h>
+
+static atomic_t idle_predictions_under_estimate;
+static atomic_t idle_predictions_over_estimate;
+static atomic_t idle_predictions_success;
+
+void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times, int index)
+{
+ int residency, target_residency;
+ int i;
+
+ /*
+ * The cpuidle driver was not able to enter an idle state, the
+ * last_residency is then zero and it does not make sense to
+ * update the predictions accuracy.
+ */
+ residency = dev->last_residency;
+ if (!residency)
+ return;
+
+ target_residency = drv->states[index].target_residency;
+
+ /*
+ * The last residency is smaller than the target residency, we
+ * overestimated the sleep time.
+ */
+ if (residency < target_residency) {
+ atomic_inc(&idle_predictions_over_estimate);
+ return;
+ }
+
+ /*
+ * The state is not necessarily the deepest one; look at the next states'
+ * target residency to check whether we could have gone deeper in idle.
+ */
+ for (i = index + 1; i < drv->state_count; i++) {
+
+ /*
+ * Ignore the disabled states
+ */
+ if (drv->states[i].disabled || dev->states_usage[i].disable)
+ continue;
+
+ /*
+ * Ignore the states which did not fit the latency
+ * constraint. As the idle states array is ordered, we
+ * know the deeper idle state will have a greater exit
+ * latency, so no need to continue the loop because
+ * none of the next idle states will fit the latency
+ * requirement.
+ */
+ if (drv->states[i].exit_latency > times->latency_req)
+ break;
+
+ /*
+ * The residency is greater than the next state's
+ * target residency. We underestimated the sleep time
+ * and could have been sleeping in a deeper state.
+ */
+ if (residency > drv->states[i].target_residency) {
+ atomic_inc(&idle_predictions_under_estimate);
+ return;
+ }
+
+ /*
+ * No need to continue looking at the deeper idle
+ * states as their target residency will be greater
+ * than the one we just compared against.
+ */
+ break;
+ }
+
+ atomic_inc(&idle_predictions_success);
+}
+
+static int __init idle_debug(void)
+{
+ struct dentry *dsched, *didle;
+ int ret = -1;
+
+ dsched = debugfs_create_dir("sched", NULL);
+ if (!dsched)
+ return -1;
+
+ didle = debugfs_create_dir("idle", dsched);
+ if (!didle)
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_under_estimate", 0600, didle,
+ &idle_predictions_under_estimate))
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_over_estimate", 0600, didle,
+ &idle_predictions_over_estimate))
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_success", 0600, didle,
+ &idle_predictions_success))
+ goto out;
+
+ ret = 0;
+out:
+ if (ret)
+ debugfs_remove_recursive(dsched);
+
+ return ret;
+}
+
+core_initcall(idle_debug)
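
A short worked example of the classification above, with a hypothetical driver whose target residencies are 10, 100 and 500 us and whose exit latencies all fit latency_req: after entering state 1 (target 100 us),

	last_residency =  60 us:  60 < 100                   ->  over estimate
	last_residency = 300 us:  300 >= 100 and 300 <= 500  ->  success
	last_residency = 800 us:  800 > 500 (state 2 fit)    ->  under estimate
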
diff --git a/kernel/sched/idle_debug.h b/kernel/sched/idle_debug.h
new file mode 100644
index 000000000000..3fca132d5fc3
--- /dev/null
+++ b/kernel/sched/idle_debug.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+struct cpuidle_device;
+struct cpuidle_driver;
+
+#ifdef CONFIG_SCHED_IDLE_DEBUG
+extern void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times,
+ int index);
+#else
+static inline void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times,
+ int index)
+{
+ ;
+}
+#endif
diff --git a/kernel/sched/io_latency.c b/kernel/sched/io_latency.c
new file mode 100644
index 000000000000..4f902d0c626f
--- /dev/null
+++ b/kernel/sched/io_latency.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "sched.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/io_latency.h>
+
+struct io_latency_tree {
+ spinlock_t lock;
+ struct rb_root tree;
+ struct io_latency_node *left_most;
+};
+
+/*
+ * This is the resolution of the statistics in usec; the latency covered
+ * by a bucket is BUCKET_INTERVAL * index.
+ * The finer the resolution, the less accurate the prediction will be.
+ * Some measurements:
+ *
+ * For 1ms:
+ * SSD 6Gb/s : 99.7%
+ * SD card class 10: 97.7%
+ * SD card class 4 : 54.3%
+ * HDD on USB : 93.6%
+ *
+ * For 500us:
+ * SSD 6Gb/s : 99.9%
+ * SD card class 10 : 96.8%
+ * SD card class 4 : 55.8%
+ * HDD on USB : 86.3%
+ *
+ * For 200us:
+ * SSD 6Gb/s : 99.7%
+ * SD card class 10 : 95.5%
+ * SD card class 4 : 29.5%
+ * HDD on USB : 66.3%
+ *
+ * For 100us:
+ * SSD 6Gb/s : 85.7%
+ * SD card class 10 : 67.63%
+ * SD card class 4 : 31.4%
+ * HDD on USB : 44.97%
+ *
+ * Aiming at 100% is not necessarily good because we want to hit the
+ * correct idle state. A coarse resolution groups the different latencies
+ * into one big interval which may overlap with a cpuidle state's target
+ * residency.
+ *
+ */
+#define BUCKET_INTERVAL 200
+
+/*
+ * Number of successive hits for the same bucket. That is the threshold
+ * triggering the move of the element to the beginning of the list, so
+ * it becomes more heavily weighted when guessing the next latency.
+ */
+#define BUCKET_SUCCESSIVE 5
+
+/*
+ * What is a bucket ?
+ *
+ * A bucket is an interval of latency. This interval is defined with the
+ * BUCKET_INTERVAL. The bucket index gives what latency interval we have.
+ * For example, if you have an index 2 and a bucket interval of 1000usec,
+ * then the bucket covers the latencies between 2000 and 2999 usec.
+ *
+ */
+struct bucket {
+ int hits;
+ int successive_hits;
+ int index;
+ int average;
+ struct list_head list;
+};
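
With the 200 us BUCKET_INTERVAL chosen above, the mapping is a plain integer division; for example (illustrative value), a measured latency of 1250 us gives:

	io_latency_bucket_index(1250) = 1250 / 200 = 6

i.e. bucket 6 covers the latencies from 1200 us to 1399 us.
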
+
+static struct kmem_cache *bucket_cachep;
+
+static DEFINE_PER_CPU(struct io_latency_tree, latency_trees);
+
+/**
+ * io_latency_bucket_find - Find a bucket associated with the specified index
+ *
+ * @tsk: the task whose bucket list is searched
+ * @index: the index of the bucket to find
+ *
+ * Returns the bucket associated with the index, NULL if no bucket is found
+ */
+static struct bucket *io_latency_bucket_find(struct task_struct *tsk, int index)
+{
+ struct list_head *list;
+ struct bucket *bucket = NULL;
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+
+ list_for_each(list, bucket_list) {
+
+ bucket = list_entry(list, struct bucket, list);
+
+ if (bucket->index == index)
+ return bucket;
+ }
+
+ return NULL;
+}
+
+/**
+ * io_latency_bucket_alloc - Allocate a bucket
+ *
+ * @index: index of the bucket to allocate
+ *
+ * Allocate and initialize a bucket structure
+ *
+ * Returns a pointer to a bucket or NULL if the allocation failed
+ */
+static struct bucket *io_latency_bucket_alloc(int index)
+{
+ struct bucket *bucket;
+
+ bucket = kmem_cache_alloc(bucket_cachep, GFP_KERNEL);
+ if (bucket) {
+ bucket->hits = 0;
+ bucket->successive_hits = 0;
+ bucket->index = index;
+ bucket->average = 0;
+ INIT_LIST_HEAD(&bucket->list);
+ }
+
+ return bucket;
+}
+
+/**
+ * io_latency_guessed_bucket - try to predict the next bucket
+ *
+ * @tsk: the task whose bucket list is examined
+ *
+ * The list is ordered by history: the first element is the one with
+ * the most *successive* hits. This function is called each time a new
+ * latency is inserted. The algorithm is pretty simple: as the first
+ * element is the one most likely to occur next, it gets the biggest
+ * weight, the second one gets less weight, etc ...
+ *
+ * The bucket with the maximum score (number of hits weighted by its
+ * position in the list) is the bucket most likely to occur next.
+ *
+ * Returns a pointer to the bucket structure, NULL if there are no
+ * buckets in the list
+ */
+static struct bucket *io_latency_guessed_bucket(struct task_struct *tsk)
+{
+ int weight = 0;
+ int score, score_max = 0;
+ struct bucket *bucket, *winner = NULL;
+ struct list_head *list = NULL;
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+
+ if (list_empty(bucket_list))
+ return NULL;
+
+ list_for_each(list, bucket_list) {
+
+ bucket = list_entry(list, struct bucket, list);
+
+ /*
+ * The list is ordered by history, the first element has
+ * more weight than the next one
+ */
+ score = bucket->hits / ((2 * weight) + 1);
+
+ weight++;
+
+ if (score < score_max)
+ continue;
+
+ score_max = score;
+ winner = bucket;
+ }
+
+ return winner;
+}
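
For example (hypothetical hit counts), with three buckets in list order A, B, C holding 12, 30 and 4 hits, the loop computes:

	score(A) = 12 / (2 * 0 + 1) = 12
	score(B) = 30 / (2 * 1 + 1) = 10
	score(C) =  4 / (2 * 2 + 1) =  0	(integer division)

so A is guessed even though B has more raw hits: sitting at the head of the list (i.e. having been hit BUCKET_SUCCESSIVE times in a row recently) outweighs older history. Note that since the comparison is "score < score_max", a later bucket wins a tie.
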
+
+/*
+ * io_latency_bucket_index - Returns the bucket index for the specified latency
+ *
+ * @latency: the latency (in usec) to map to a bucket index
+ *
+ * Returns an integer for the bucket's index
+ */
+static int io_latency_bucket_index(int latency)
+{
+ return latency / BUCKET_INTERVAL;
+}
+
+/*
+ * io_latency_bucket_fill - Compute and fill the bucket list
+ *
+ * @tsk: the task completing an IO
+ * @latency: the latency of the IO
+ *
+ * The dynamics of the list are as follows:
+ * - Each new element is inserted at the end of the list
+ * - Each element hit <BUCKET_SUCCESSIVE> successive times in this function
+ *   is moved to the beginning of the list
+ *
+ * Returns 0 on success, -1 if a bucket allocation failed
+ */
+static int io_latency_bucket_fill(struct task_struct *tsk, int latency)
+{
+ int diff, index = io_latency_bucket_index(latency);
+ struct bucket *bucket;
+
+ /*
+ * Find the bucket associated with the index
+ */
+ bucket = io_latency_bucket_find(tsk, index);
+ if (!bucket) {
+ bucket = io_latency_bucket_alloc(index);
+ if (!bucket)
+ return -1;
+
+ list_add_tail(&bucket->list, &tsk->io_latency.bucket_list);
+ }
+
+ /*
+ * Increase the number of times this bucket has been hit
+ */
+ bucket->hits++;
+ bucket->successive_hits++;
+
+ /*
+ * Compute a sliding average for latency in this bucket
+ */
+ diff = latency - bucket->average;
+ bucket->average += (diff >> 6);
+
+ /*
+ * We hit a successive number of times the same bucket, move
+ * it at the beginning of the list
+ */
+ if (bucket->successive_hits == BUCKET_SUCCESSIVE) {
+ list_move(&bucket->list, &tsk->io_latency.bucket_list);
+ bucket->successive_hits = 1;
+ }
+
+ return 0;
+}
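
The "bucket->average += diff >> 6" update above is an exponential moving average with a 1/64 weight:

	avg_new = avg_old + (latency - avg_old) / 64

so a single sample moves the average by roughly 1.5% of its distance to the observed latency, and about 64 samples are needed for the average to track a sustained change in the IO latency.
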
+
+/*
+ * exit_io_latency - free resources when the task exits
+ *
+ * @tsk : the exiting task
+ *
+ */
+void exit_io_latency(struct task_struct *tsk)
+{
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+ struct list_head *tmp, *list;
+ struct bucket *bucket;
+
+ list_for_each_safe(list, tmp, bucket_list) {
+
+ list_del(list);
+ bucket = list_entry(list, struct bucket, list);
+ kmem_cache_free(bucket_cachep, bucket);
+ }
+}
+
+/**
+ * io_latency_init : initialization routine
+ *
+ * Initializes the cache pool and the io latency rb trees.
+ */
+void io_latency_init(void)
+{
+ int cpu;
+ struct io_latency_tree *latency_tree;
+ struct rb_root *root;
+
+ bucket_cachep = KMEM_CACHE(bucket, SLAB_PANIC);
+
+ for_each_possible_cpu(cpu) {
+ latency_tree = &per_cpu(latency_trees, cpu);
+ latency_tree->left_most = NULL;
+ spin_lock_init(&latency_tree->lock);
+ root = &latency_tree->tree;
+ root->rb_node = NULL;
+ }
+}
+
+/**
+ * io_latency_get_sleep_length: compute the expected sleep time
+ *
+ * @rq: the runqueue associated with the cpu
+ *
+ * Returns the minimal estimated remaining sleep time for the pending IOs
+ */
+s64 io_latency_get_sleep_length(struct rq *rq)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct io_latency_node *node;
+ ktime_t now = ktime_get();
+ s64 diff;
+
+ node = latency_tree->left_most;
+
+ if (!node)
+ return 0;
+
+ diff = ktime_to_us(ktime_sub(now, node->start_time));
+ diff = node->avg_latency - diff;
+
+ /* Estimation was wrong, return 0 */
+ if (diff < 0)
+ return 0;
+
+ return diff;
+}
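
For instance (illustrative numbers), if the left-most blocked task has an avg_latency of 900 us and its IO started 250 us ago, the expected remaining sleep is:

	900 - 250 = 650 us

and once the elapsed time exceeds the average, the estimate is considered wrong and 0 is returned.
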
+
+/**
+ * io_latency_avg: compute the io latency sliding average value
+ *
+ * @tsk: the task whose io latency average is to be updated
+ *
+ */
+static void io_latency_avg(struct task_struct *tsk)
+{
+ struct io_latency_node *node = &tsk->io_latency;
+ s64 latency = ktime_to_us(ktime_sub(node->end_time, node->start_time));
+ struct bucket *bucket;
+
+ io_latency_bucket_fill(tsk, latency);
+
+ bucket = io_latency_guessed_bucket(tsk);
+ if (bucket)
+ node->avg_latency = bucket->average;
+}
+
+/**
+ * io_latency_begin - insert the node in the rb tree
+ *
+ * @rq: the runqueue the task is running on
+ * @task: the task being blocked on an IO
+ *
+ * Inserts the node in the rbtree in an ordered manner. If this task
+ * has the minimal io latency of all the tasks blocked on IO, it becomes
+ * the left most node and a shortcut to it is kept. Also stores the
+ * start time of the io schedule.
+ *
+ */
+int io_latency_begin(struct rq *rq, struct task_struct *tsk)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct rb_root *root = &latency_tree->tree;
+ struct io_latency_node *node = &tsk->io_latency;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct io_latency_node *lat;
+ int leftmost = 1;
+
+ node->start_time = ktime_get();
+
+ spin_lock(&latency_tree->lock);
+
+ while (*new) {
+ lat = rb_entry(*new, struct io_latency_node, node);
+
+ parent = *new;
+
+ if (lat->avg_latency > node->avg_latency)
+ new = &parent->rb_left;
+ else {
+ new = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ latency_tree->left_most = node;
+
+ rb_link_node(&node->node, parent, new);
+ rb_insert_color(&node->node, root);
+
+ spin_unlock(&latency_tree->lock);
+
+ return 0;
+}
+
+/**
+ * io_latency_end - Removes the node from the rb tree
+ *
+ * @rq: the runqueue the task belongs to
+ * @tsk: the task woken up after an IO completion
+ *
+ * Removes the node from the rb tree for this cpu. Updates the left most
+ * shortcut to the next node if this node is itself the left most one.
+ * Retrieves the end time after the io has completed and updates the
+ * io latency average time.
+ */
+void io_latency_end(struct rq *rq, struct task_struct *tsk)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct rb_root *root = &latency_tree->tree;
+ struct io_latency_node *old = &tsk->io_latency;
+
+ old->end_time = ktime_get();
+
+ spin_lock(&latency_tree->lock);
+
+ if (latency_tree->left_most == old) {
+ struct rb_node *next_node =
+ rb_next(&latency_tree->left_most->node);
+ latency_tree->left_most =
+ rb_entry(next_node, struct io_latency_node, node);
+ }
+
+ rb_erase(&old->node, root);
+
+ spin_unlock(&latency_tree->lock);
+
+ io_latency_avg(tsk);
+
+ trace_io_latency_entry(
+ ktime_to_us(ktime_sub(old->end_time, old->start_time)),
+ old->avg_latency);
+}
diff --git a/kernel/sched/io_latency.h b/kernel/sched/io_latency.h
new file mode 100644
index 000000000000..abe063ee6417
--- /dev/null
+++ b/kernel/sched/io_latency.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+#ifdef CONFIG_SCHED_IO_LATENCY
+extern void io_latency_init(void);
+extern int io_latency_begin(struct rq *rq, struct task_struct *tsk);
+extern void io_latency_end(struct rq *rq, struct task_struct *tsk);
+extern int io_latency_get_sleep_length(struct rq *rq);
+#else
+static inline void io_latency_init(void)
+{
+ ;
+}
+
+static inline int io_latency_begin(struct rq *rq, struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void io_latency_end(struct rq *rq, struct task_struct *tsk)
+{
+ ;
+}
+
+static inline int io_latency_get_sleep_length(struct rq *rq)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..947663d2935a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -284,6 +284,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "sched_energy_option",
+ .data = &sysctl_sched_energy_option,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4e35a5d767ed..6e5b5ba795db 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -812,6 +812,15 @@ config SCHED_DEBUG
that can help debug the scheduler. The runtime overhead of this
option is minimal.
+config SCHED_IDLE_DEBUG
+ bool "Collect idle prediction accuracy"
+ depends on CPU_IDLE && DEBUG_FS
+ default n
+ help
+ If you say Y here, the /sys/kernel/debug/sched/idle directory
+ will provide statistics about the success of the idle state
+ predictions. The overhead is negligible.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS