-rw-r--r--  arch/x86/kernel/smp.c               |  20
-rw-r--r--  drivers/cpuidle/Kconfig             |   4
-rw-r--r--  drivers/cpuidle/cpuidle.c           |  21
-rw-r--r--  drivers/cpuidle/governors/Makefile  |   1
-rw-r--r--  drivers/cpuidle/governors/ladder.c  |  11
-rw-r--r--  drivers/cpuidle/governors/menu.c    |  15
-rw-r--r--  drivers/cpuidle/governors/select.c  |  55
-rw-r--r--  drivers/cpuidle/sysfs.c             | 156
-rw-r--r--  include/linux/cpuidle.h             |  23
-rw-r--r--  include/linux/sched.h               |  21
-rw-r--r--  include/linux/sched/sysctl.h        |   8
-rw-r--r--  include/trace/events/io_latency.h   |  32
-rw-r--r--  init/Kconfig                        |  11
-rw-r--r--  kernel/exit.c                       |   1
-rw-r--r--  kernel/fork.c                       |   5
-rw-r--r--  kernel/sched/Makefile               |   2
-rw-r--r--  kernel/sched/core.c                 |   7
-rw-r--r--  kernel/sched/fair.c                 |   4
-rw-r--r--  kernel/sched/idle.c                 |  33
-rw-r--r--  kernel/sched/idle_debug.c           | 126
-rw-r--r--  kernel/sched/idle_debug.h           |  29
-rw-r--r--  kernel/sched/io_latency.c           | 442
-rw-r--r--  kernel/sched/io_latency.h           |  38
-rw-r--r--  kernel/sysctl.c                     |  11
-rw-r--r--  lib/Kconfig.debug                   |   9
25 files changed, 1052 insertions, 33 deletions
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa..d193609beabc 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -31,6 +31,16 @@
#include <asm/apic.h>
#include <asm/nmi.h>
#include <asm/trace/irq_vectors.h>
+
+#define CREATE_TRACE_POINTS
+/*
+ * These macros were defined by <asm/trace/irq_vectors.h> and cause
+ * problems when including <trace/events/ipi.h>, so undefine them first.
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#include <trace/events/ipi.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -124,11 +134,13 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
+ trace_ipi_raise(cpumask_of(cpu), tracepoint_string("RESCHEDULE"));
apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
+ trace_ipi_raise(cpumask_of(cpu), tracepoint_string("CALL_FUNCTION_SINGLE"));
apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
@@ -136,6 +148,8 @@ void native_send_call_func_ipi(const struct cpumask *mask)
{
cpumask_var_t allbutself;
+ trace_ipi_raise(mask, tracepoint_string("CALL_FUNCTION"));
+
if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
return;
@@ -252,8 +266,10 @@ finish:
*/
static inline void __smp_reschedule_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("RESCHEDULE"));
inc_irq_stat(irq_resched_count);
scheduler_ipi();
+ trace_ipi_exit(tracepoint_string("RESCHEDULE"));
}
__visible void smp_reschedule_interrupt(struct pt_regs *regs)
@@ -291,8 +307,10 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
static inline void __smp_call_function_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("CALL_FUNCTION"));
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
+ trace_ipi_exit(tracepoint_string("CALL_FUNCTION"));
}
__visible void smp_call_function_interrupt(struct pt_regs *regs)
@@ -313,8 +331,10 @@ __visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
static inline void __smp_call_function_single_interrupt(void)
{
+ trace_ipi_entry(tracepoint_string("CALL_FUNCTION_SINGLE"));
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
+ trace_ipi_exit(tracepoint_string("CALL_FUNCTION_SINGLE"));
}
__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
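
For reference, the trace_ipi_raise()/trace_ipi_entry()/trace_ipi_exit() calls added above assume that <trace/events/ipi.h> declares matching events (instantiated here because of the CREATE_TRACE_POINTS define). A minimal sketch of the handler-side event class (illustrative only; the real header may differ) could look like this:

DECLARE_EVENT_CLASS(ipi_handler,

	TP_PROTO(const char *reason),

	TP_ARGS(reason),

	TP_STRUCT__entry(
		/* the reason is a persistent tracepoint_string(), so storing
		 * the pointer is enough */
		__field(const char *, reason)
	),

	TP_fast_assign(
		__entry->reason = reason;
	),

	TP_printk("(%s)", __entry->reason)
);

DEFINE_EVENT(ipi_handler, ipi_entry,
	TP_PROTO(const char *reason),
	TP_ARGS(reason)
);

DEFINE_EVENT(ipi_handler, ipi_exit,
	TP_PROTO(const char *reason),
	TP_ARGS(reason)
);
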
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c5029c1209b4..6deb2473c69c 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -25,6 +25,10 @@ config CPU_IDLE_GOV_MENU
bool "Menu governor (for tickless system)"
default y
+config CPU_IDLE_GOV_SELECT
+ bool "Select governor (for tickless system)"
+ default y
+
config DT_IDLE_STATES
bool
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150dc6e81..a79c4db22ffa 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,16 +8,12 @@
* This code is licenced under the GPL.
*/
-#include <linux/clockchips.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
-#include <linux/sched.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
#include <linux/module.h>
#include <trace/events/power.h>
@@ -122,6 +118,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ktime_get();
+ target_state->idle_start = ktime_to_us(time_start);
+
entered_state = target_state->enter(dev, drv, index);
time_end = ktime_get();
@@ -134,6 +132,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
if (diff > INT_MAX)
diff = INT_MAX;
+ target_state->idle_start = 0;
+
dev->last_residency = (int) diff;
if (entered_state >= 0) {
@@ -143,6 +143,14 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
*/
dev->states_usage[entered_state].time += dev->last_residency;
dev->states_usage[entered_state].usage++;
+ if (diff < dev->last_residency)
+ atomic_inc(&dev->over_estimate);
+ else if (entered_state < (drv->state_count - 1) &&
+ dev->last_residency <
+ drv->states[entered_state + 1].target_residency)
+ atomic_inc(&dev->under_estimate);
+ else
+ atomic_inc(&dev->right_estimate);
} else {
dev->last_residency = 0;
}
@@ -158,7 +166,8 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
*
* Returns the index of the idle state.
*/
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
if (off || !initialized)
return -ENODEV;
@@ -169,7 +178,7 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
if (unlikely(use_deepest_state))
return cpuidle_find_deepest_state(drv, dev);
- return cpuidle_curr_governor->select(drv, dev);
+ return cpuidle_curr_governor->select(drv, dev, times);
}
/**
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 1b512722689f..fa455206fede 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,3 +4,4 @@
obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_SELECT) += select.o
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 06b57c4c4d80..4f5ec8ade22e 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -64,18 +64,13 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
* @dev: the CPU
*/
static int ladder_select_state(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
struct ladder_device_state *last_state;
+ int latency_req = times->latency_req;
int last_residency, last_idx = ldev->last_state_idx;
- int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-
- /* Special case when user has set very strict latency requirement */
- if (unlikely(latency_req == 0)) {
- ladder_do_selection(ldev, last_idx, 0);
- return 0;
- }
last_state = &ldev->states[last_idx];
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 710a233b9b0d..e360b08ea44a 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -13,10 +13,6 @@
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
-#include <linux/time.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/module.h>
@@ -287,10 +283,11 @@ again:
* @drv: cpuidle driver containing state data
* @dev: the CPU
*/
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{
struct menu_device *data = this_cpu_ptr(&menu_devices);
- int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+ int latency_req = times->latency_req;
int i;
unsigned int interactivity_req;
unsigned long nr_iowaiters, cpu_load;
@@ -302,12 +299,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
- /* Special case when user has set very strict latency requirement */
- if (unlikely(latency_req == 0))
- return 0;
-
/* determine the expected residency time, round up */
- data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ data->next_timer_us = times->next_timer_event;
get_iowait_load(&nr_iowaiters, &cpu_load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
diff --git a/drivers/cpuidle/governors/select.c b/drivers/cpuidle/governors/select.c
new file mode 100644
index 000000000000..2193b7870b34
--- /dev/null
+++ b/drivers/cpuidle/governors/select.c
@@ -0,0 +1,55 @@
+/*
+ * select.c - the select governor
+ *
+ * Copyright (C) 2014 Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+
+#include <linux/cpuidle.h>
+
+static int select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ struct cpuidle_times *times)
+{
+ int i, index = 0, latency_req = times->latency_req;
+ unsigned int next_event;
+
+ /*
+ * If the guessed IO next event is zero, that means there is no IO
+ * pending, so we ignore it in the equation
+ */
+ next_event = times->next_io_event ?
+ min(times->next_io_event, times->next_timer_event) :
+ times->next_timer_event;
+
+ for (i = 0; i < drv->state_count; i++) {
+
+ struct cpuidle_state *s = &drv->states[i];
+ struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+ if (s->disabled || su->disable)
+ continue;
+ if (s->target_residency > next_event)
+ continue;
+ if (s->exit_latency > latency_req)
+ continue;
+
+ index = i;
+ }
+
+ return index;
+}
+
+static struct cpuidle_governor select_governor = {
+ .name = "select",
+ .rating = 30,
+ .select = select,
+ .owner = THIS_MODULE,
+};
+
+static int __init select_init(void)
+{
+ return cpuidle_register_governor(&select_governor);
+}
+
+postcore_initcall(select_init);
+
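
To see what the selection loop above does, consider a hypothetical four-state driver (illustrative numbers only) with latency_req = 300 us, next_timer_event = 3000 us and next_io_event = 800 us, so next_event = min(800, 3000) = 800 us:

/*   state   target_residency   exit_latency
 *     0              1 us            1 us    fits, index = 0
 *     1            100 us           50 us    fits, index = 1
 *     2            700 us          200 us    fits, index = 2
 *     3           2000 us         1000 us    skipped (2000 > 800)
 *
 * Each state satisfying both constraints overwrites 'index', so the
 * deepest state that still fits (state 2 here) is returned.
 */
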
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 97c5903b4606..f446bd0fd9bd 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -439,6 +439,154 @@ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
cpuidle_free_state_kobj(device, i);
}
+#define kobj_to_stats_kobj(k) container_of(k, struct cpuidle_stats_kobj, kobj)
+#define attr_to_stats_attr(a) container_of(a, struct cpuidle_stats_attr, attr)
+
+#define define_show_stats_function(_name) \
+ static ssize_t show_stats_##_name(struct cpuidle_device *dev, \
+ char *buf) \
+ { \
+ return sprintf(buf, "%d\n", atomic_read(&dev->_name)); \
+ }
+
+#define define_store_stats_function(_name) \
+ static ssize_t store_stats_##_name(struct cpuidle_device *dev, \
+ const char *buf, size_t size) \
+ { \
+ unsigned long long value; \
+ int err; \
+ if (!capable(CAP_SYS_ADMIN)) \
+ return -EPERM; \
+ err = kstrtoull(buf, 0, &value); \
+ if (err) \
+ return err; \
+ \
+ atomic_set(&dev->_name, value); \
+ return size; \
+ }
+
+#define define_one_stats_rw(_name, show, store) \
+ static struct cpuidle_stats_attr attr_stats_##_name = \
+ __ATTR(_name, 0644, show, store)
+
+struct cpuidle_stats_kobj {
+ struct cpuidle_device *dev;
+ struct completion kobj_unregister;
+ struct kobject kobj;
+};
+
+struct cpuidle_stats_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct cpuidle_device *, char *);
+ ssize_t (*store)(struct cpuidle_device *, const char *, size_t);
+};
+
+static void cpuidle_stats_sysfs_release(struct kobject *kobj)
+{
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ complete(&stats_kobj->kobj_unregister);
+}
+
+static ssize_t cpuidle_stats_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ int ret = -EIO;
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+ if (dattr->show)
+ ret = dattr->show(stats_kobj->dev, buf);
+
+ return ret;
+}
+
+static ssize_t cpuidle_stats_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t size)
+{
+ int ret = -EIO;
+ struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+ struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+ if (dattr->store)
+ ret = dattr->store(stats_kobj->dev, buf, size);
+
+ return ret;
+}
+
+define_show_stats_function(right_estimate);
+define_store_stats_function(right_estimate);
+
+define_show_stats_function(under_estimate);
+define_store_stats_function(under_estimate);
+
+define_show_stats_function(over_estimate);
+define_store_stats_function(over_estimate);
+
+define_one_stats_rw(right_estimate,
+ show_stats_right_estimate,
+ store_stats_right_estimate);
+
+define_one_stats_rw(under_estimate,
+ show_stats_under_estimate,
+ store_stats_under_estimate);
+
+define_one_stats_rw(over_estimate,
+ show_stats_over_estimate,
+ store_stats_over_estimate);
+
+static const struct sysfs_ops cpuidle_stats_sysfs_ops = {
+ .show = cpuidle_stats_show,
+ .store = cpuidle_stats_store,
+};
+
+static struct attribute *cpuidle_stats_default_attrs[] = {
+ &attr_stats_right_estimate.attr,
+ &attr_stats_under_estimate.attr,
+ &attr_stats_over_estimate.attr,
+ NULL
+};
+
+static struct kobj_type ktype_stats_cpuidle = {
+ .sysfs_ops = &cpuidle_stats_sysfs_ops,
+ .default_attrs = cpuidle_stats_default_attrs,
+ .release = cpuidle_stats_sysfs_release,
+};
+
+static int cpuidle_add_stats_sysfs(struct cpuidle_device *dev)
+{
+ struct cpuidle_stats_kobj *kstats;
+ struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+ int ret;
+
+ kstats = kzalloc(sizeof(*kstats), GFP_KERNEL);
+ if (!kstats)
+ return -ENOMEM;
+
+ kstats->dev = dev;
+ init_completion(&kstats->kobj_unregister);
+
+ ret = kobject_init_and_add(&kstats->kobj, &ktype_stats_cpuidle,
+ &kdev->kobj, "stats");
+ if (ret) {
+ kfree(kstats);
+ return ret;
+ }
+
+ kobject_uevent(&kstats->kobj, KOBJ_ADD);
+ dev->kobj_stats = kstats;
+
+ return ret;
+}
+
+static void cpuidle_remove_stats_sysfs(struct cpuidle_device *dev)
+{
+ struct cpuidle_stats_kobj *kstats = dev->kobj_stats;
+ kobject_put(&kstats->kobj);
+ wait_for_completion(&kstats->kobj_unregister);
+ kfree(kstats);
+}
+
#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
#define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj)
#define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr)
@@ -589,6 +737,13 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
ret = cpuidle_add_driver_sysfs(device);
if (ret)
cpuidle_remove_state_sysfs(device);
+
+ ret = cpuidle_add_stats_sysfs(device);
+ if (ret) {
+ cpuidle_remove_driver_sysfs(device);
+ cpuidle_remove_state_sysfs(device);
+ }
+
return ret;
}
@@ -598,6 +753,7 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
*/
void cpuidle_remove_device_sysfs(struct cpuidle_device *device)
{
+ cpuidle_remove_stats_sysfs(device);
cpuidle_remove_driver_sysfs(device);
cpuidle_remove_state_sysfs(device);
}
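
With the hunks above, each registered cpuidle device gains a "stats" kobject exposing the three counters as read/write attributes; writing a value (CAP_SYS_ADMIN required) resets them. Assuming the usual cpuidle sysfs location for the per-cpu device, a small userspace sketch to sample one counter could be:

#include <stdio.h>

int main(void)
{
	/* path assumed from the standard cpuidle sysfs layout; adjust if it differs */
	const char *path =
		"/sys/devices/system/cpu/cpu0/cpuidle/stats/over_estimate";
	unsigned long long val;
	FILE *fp = fopen(path, "r");

	if (!fp)
		return 1;
	if (fscanf(fp, "%llu", &val) == 1)
		printf("cpu0 over_estimate: %llu\n", val);
	fclose(fp);
	return 0;
}
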
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 25e0df6155a4..3ac0ded7d75a 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,7 +44,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned int target_residency; /* in US */
bool disabled; /* disabled on all CPUs */
-
+ s64 idle_start;
int (*enter) (struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index);
@@ -62,6 +62,7 @@ struct cpuidle_state {
struct cpuidle_device_kobj;
struct cpuidle_state_kobj;
struct cpuidle_driver_kobj;
+struct cpuidle_stats_kobj;
struct cpuidle_device {
unsigned int registered:1;
@@ -74,8 +75,13 @@ struct cpuidle_device {
struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
struct cpuidle_driver_kobj *kobj_driver;
struct cpuidle_device_kobj *kobj_dev;
+ struct cpuidle_stats_kobj *kobj_stats;
struct list_head device_list;
+ atomic_t right_estimate;
+ atomic_t under_estimate;
+ atomic_t over_estimate;
+
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
int safe_state_index;
cpumask_t coupled_cpus;
@@ -83,6 +89,12 @@ struct cpuidle_device {
#endif
};
+struct cpuidle_times {
+ unsigned int latency_req;
+ unsigned int next_timer_event;
+ unsigned int next_io_event;
+};
+
DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev);
@@ -122,7 +134,8 @@ struct cpuidle_driver {
extern void disable_cpuidle(void);
extern int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times);
extern int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index);
extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -150,7 +163,8 @@ extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev)
#else
static inline void disable_cpuidle(void) { }
static inline int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times)
{return -ENODEV; }
static inline int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
@@ -205,7 +219,8 @@ struct cpuidle_governor {
struct cpuidle_device *dev);
int (*select) (struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times);
void (*reflect) (struct cpuidle_device *dev, int index);
struct module *owner;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a100d8713..fc3a7cf107ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1284,6 +1284,24 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+
+#ifdef CONFIG_SCHED_IO_LATENCY
+struct io_latency_node {
+ struct rb_node node;
+ unsigned int avg_latency;
+ ktime_t start_time;
+ ktime_t end_time;
+ struct list_head bucket_list;
+};
+
+void exit_io_latency(struct task_struct *tsk);
+#else
+static inline void exit_io_latency(struct task_struct *tsk)
+{
+ ;
+}
+#endif
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1708,6 +1726,9 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
+#ifdef CONFIG_SCHED_IO_LATENCY
+ struct io_latency_node io_latency;
+#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 596a0e007c62..2d73bd23e206 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,14 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_sched_energy_option;
+
+int sched_proc_energy_option_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
+
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
diff --git a/include/trace/events/io_latency.h b/include/trace/events/io_latency.h
new file mode 100644
index 000000000000..ab679fcd8d27
--- /dev/null
+++ b/include/trace/events/io_latency.h
@@ -0,0 +1,32 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM io_latency
+
+#if !defined(_TRACE_IO_LATENCY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_LATENCY_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(io_latency_entry,
+
+ TP_PROTO(u64 latency, u64 avg_latency),
+
+ TP_ARGS(latency, avg_latency),
+
+ TP_STRUCT__entry(
+ __field( u64, latency )
+ __field( u64, avg_latency )
+ ),
+
+ TP_fast_assign(
+ __entry->latency = latency;
+ __entry->avg_latency = avg_latency;
+ ),
+
+ TP_printk("latency=%llu, avg latency=%llu",
+ __entry->latency, __entry->avg_latency)
+);
+
+#endif /* _TRACE_IO_LATENCY_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/init/Kconfig b/init/Kconfig
index 3ee28ae02cc8..b849c0947dd0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1230,6 +1230,17 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_IO_LATENCY
+ bool "IO latency tracking for the scheduler"
+ depends on SMP
+ help
+ This option tracks, for each task, the average time it spends
+ blocked on IO. It gives more information about how long a cpu
+ will remain idle and helps to take better scheduling and
+ cpuidle decisions.
+
+ If unsure, say Y.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..8e4e75d5efaa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -759,6 +759,7 @@ void do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread();
+ exit_io_latency(tsk);
/*
* Flush inherited counters to the parent - before the parent
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..13b5cbf53628 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
-
+#ifdef CONFIG_SCHED_IO_LATENCY
+ tsk->io_latency.avg_latency = 0;
+ INIT_LIST_HEAD(&tsk->io_latency.bucket_list);
+#endif
account_kernel_stack(ti, 1);
return tsk;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7cd404cd5608..5d5380090741 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,4 +19,6 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_IO_LATENCY) += io_latency.o
+obj-$(CONFIG_SCHED_IDLE_DEBUG) += idle_debug.o
obj-$(CONFIG_CPU_FREQ_GOV_ENERGY_MODEL) += energy_model.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02ec4a5b5fd3..4db493445cd6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,7 @@
#endif
#include "sched.h"
+#include "io_latency.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
@@ -4396,7 +4397,9 @@ void __sched io_schedule(void)
atomic_inc(&rq->nr_iowait);
blk_flush_plug(current);
current->in_iowait = 1;
+ io_latency_begin(rq, current);
schedule();
+ io_latency_end(rq, current);
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -4412,7 +4415,9 @@ long __sched io_schedule_timeout(long timeout)
atomic_inc(&rq->nr_iowait);
blk_flush_plug(current);
current->in_iowait = 1;
+ io_latency_begin(rq, current);
ret = schedule_timeout(timeout);
+ io_latency_end(rq, current);
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -7167,6 +7172,8 @@ void __init sched_init(void)
autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
+
+ io_latency_init();
for_each_possible_cpu(i) {
struct rq *rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 21043e9b31ae..5f3056215e41 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -62,6 +62,10 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
enum sched_tunable_scaling sysctl_sched_tunable_scaling
= SCHED_TUNABLESCALING_LOG;
+#ifdef CONFIG_SMP
+unsigned int sysctl_sched_energy_option = 0; /* Experimental code, disabled by default */
+#endif
+
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..9fed4d593773 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -4,7 +4,8 @@
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/tick.h>
+#include <linux/ktime.h>
+#include <linux/pm_qos.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
@@ -13,6 +14,8 @@
#include <trace/events/power.h>
#include "sched.h"
+#include "io_latency.h"
+#include "idle_debug.h"
static int __read_mostly cpu_idle_force_poll;
@@ -78,8 +81,9 @@ static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ struct cpuidle_times times;
int next_state, entered_state;
- unsigned int broadcast;
+ bool broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -103,11 +107,29 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
+ times.latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+ /*
+ * The latency requirement does not allow any latency, jump to
+ * the default idle function without entering the cpuidle code
+ */
+ if (times.latency_req == 0)
+ goto use_default;
+
+ /*
+ * Retrieve the next timer event
+ */
+ times.next_timer_event = ktime_to_us(tick_nohz_get_sleep_length());
+
+ /*
+ * Retrieve the next IO guessed event
+ */
+ times.next_io_event = io_latency_get_sleep_length(this_rq());
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &times);
if (next_state < 0) {
use_default:
/*
@@ -160,6 +182,11 @@ use_default:
/* The cpu is no longer idle or about to enter idle. */
idle_set_state(this_rq(), NULL);
+ /*
+ * Update the prediction rating
+ */
+ idle_debug_prediction_update(drv, dev, &times, entered_state);
+
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
diff --git a/kernel/sched/idle_debug.c b/kernel/sched/idle_debug.c
new file mode 100644
index 000000000000..9cb17452729d
--- /dev/null
+++ b/kernel/sched/idle_debug.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/atomic.h>
+#include <linux/init.h>
+
+static atomic_t idle_predictions_under_estimate;
+static atomic_t idle_predictions_over_estimate;
+static atomic_t idle_predictions_success;
+
+void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times, int index)
+{
+ int residency, target_residency;
+ int i;
+
+ /*
+ * The cpuidle driver was not able to enter an idle state, the
+ * last_residency is then zero and it does not make sense to
+ * update the predictions accuracy.
+ */
+ residency = dev->last_residency;
+ if (!residency)
+ return;
+
+ target_residency = drv->states[index].target_residency;
+
+ /*
+ * The last residency is smaller than the target residency, we
+ * overestimated the sleep time.
+ */
+ if (residency < target_residency) {
+ atomic_inc(&idle_predictions_over_estimate);
+ return;
+ }
+
+ /*
+ * The state is not necessarily the deepest one; look at the next states'
+ * target residency to check whether we could have gone deeper in idle.
+ */
+ for (i = index + 1; i < drv->state_count; i++) {
+
+ /*
+ * Ignore the disabled states
+ */
+ if (drv->states[i].disabled || dev->states_usage[i].disable)
+ continue;
+
+ /*
+ * Ignore the states which did not fit the latency
+ * constraint. As the idle states array is ordered, we
+ * know the deeper idle state will have a greater exit
+ * latency, so no need to continue the loop because
+ * none of the next idle states will fit the latency
+ * requirement.
+ */
+ if (drv->states[i].exit_latency > times->latency_req)
+ break;
+
+ /*
+ * The residency is greater than the next state's
+ * target residency. We underestimated the sleep time
+ * and could have been sleeping in a deeper state.
+ */
+ if (residency > drv->states[i].target_residency) {
+ atomic_inc(&idle_predictions_under_estimate);
+ return;
+ }
+
+ /*
+ * No need to continue looking at the deeper idle
+ * states as their target residency will be greater
+ * than the one we just compared against.
+ */
+ break;
+ }
+
+ atomic_inc(&idle_predictions_success);
+}
+
+static int __init idle_debug(void)
+{
+ struct dentry *dsched, *didle;
+ int ret = -1;
+
+ dsched = debugfs_create_dir("sched", NULL);
+ if (!dsched)
+ return -1;
+
+ didle = debugfs_create_dir("idle", dsched);
+ if (!didle)
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_under_estimate", 0600, didle,
+ &idle_predictions_under_estimate))
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_over_estimate", 0600, didle,
+ &idle_predictions_over_estimate))
+ goto out;
+
+ if (!debugfs_create_atomic_t("predictions_success", 0600, didle,
+ &idle_predictions_success))
+ goto out;
+
+ ret = 0;
+out:
+ if (ret)
+ debugfs_remove_recursive(dsched);
+
+ return ret;
+}
+
+core_initcall(idle_debug)
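
A short worked example of the classification above, with a hypothetical driver whose target residencies are 10, 100 and 500 us and whose exit latencies all fit latency_req: after entering state 1 (target 100 us),

	last_residency =  60 us:  60 < 100                   ->  over estimate
	last_residency = 300 us:  300 >= 100 and 300 <= 500  ->  success
	last_residency = 800 us:  800 > 500 (state 2 fit)    ->  under estimate
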
diff --git a/kernel/sched/idle_debug.h b/kernel/sched/idle_debug.h
new file mode 100644
index 000000000000..3fca132d5fc3
--- /dev/null
+++ b/kernel/sched/idle_debug.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+struct cpuidle_device;
+struct cpuidle_driver;
+
+#ifdef CONFIG_SCHED_IDLE_DEBUG
+extern void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times,
+ int index);
+#else
+static inline void idle_debug_prediction_update(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ struct cpuidle_times *times,
+ int index)
+{
+ ;
+}
+#endif
diff --git a/kernel/sched/io_latency.c b/kernel/sched/io_latency.c
new file mode 100644
index 000000000000..4f902d0c626f
--- /dev/null
+++ b/kernel/sched/io_latency.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "sched.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/io_latency.h>
+
+struct io_latency_tree {
+ spinlock_t lock;
+ struct rb_root tree;
+ struct io_latency_node *left_most;
+};
+
+/*
+ * This is the resolution of the statistics in usec; the latency covered
+ * by a bucket is BUCKET_INTERVAL * index.
+ * The finer the resolution, the less accurate the prediction will be.
+ * Some measurements:
+ *
+ * For 1ms:
+ * SSD 6Gb/s : 99.7%
+ * SD card class 10: 97.7%
+ * SD card class 4 : 54.3%
+ * HDD on USB : 93.6%
+ *
+ * For 500us:
+ * SSD 6Gb/s : 99.9%
+ * SD card class 10 : 96.8%
+ * SD card class 4 : 55.8%
+ * HDD on USB : 86.3%
+ *
+ * For 200us:
+ * SSD 6Gb/s : 99.7%
+ * SD card class 10 : 95.5%
+ * SD card class 4 : 29.5%
+ * HDD on USB : 66.3%
+ *
+ * For 100us:
+ * SSD 6Gb/s : 85.7%
+ * SD card class 10 : 67.63%
+ * SD card class 4 : 31.4%
+ * HDD on USB : 44.97%
+ *
+ * Aiming at 100% is not necessarily good because we want to hit the
+ * correct idle state. A coarse resolution groups the different latencies
+ * into one big interval which may overlap with a cpuidle state's target
+ * residency.
+ *
+ */
+#define BUCKET_INTERVAL 200
+
+/*
+ * Number of successive hits for the same bucket. That is the threshold
+ * triggering the move of the element to the beginning of the list, so
+ * it becomes more heavily weighted when guessing the next latency.
+ */
+#define BUCKET_SUCCESSIVE 5
+
+/*
+ * What is a bucket ?
+ *
+ * A bucket is an interval of latency. This interval is defined with the
+ * BUCKET_INTERVAL. The bucket index gives what latency interval we have.
+ * For example, if you have an index 2 and a bucket interval of 1000usec,
+ * then the bucket covers the latencies between 2000 and 2999 usec.
+ *
+ */
+struct bucket {
+ int hits;
+ int successive_hits;
+ int index;
+ int average;
+ struct list_head list;
+};
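
With the 200 us BUCKET_INTERVAL chosen above, the mapping is a plain integer division; for example (illustrative value), a measured latency of 1250 us gives:

	io_latency_bucket_index(1250) = 1250 / 200 = 6

i.e. bucket 6 covers the latencies from 1200 us to 1399 us.
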
+
+static struct kmem_cache *bucket_cachep;
+
+static DEFINE_PER_CPU(struct io_latency_tree, latency_trees);
+
+/**
+ * io_latency_bucket_find - Find a bucket associated with the specified index
+ *
+ * @tsk: the task whose bucket list is searched
+ * @index: the index of the bucket to find
+ *
+ * Returns the bucket associated with the index, NULL if no bucket is found
+ */
+static struct bucket *io_latency_bucket_find(struct task_struct *tsk, int index)
+{
+ struct list_head *list;
+ struct bucket *bucket = NULL;
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+
+ list_for_each(list, bucket_list) {
+
+ bucket = list_entry(list, struct bucket, list);
+
+ if (bucket->index == index)
+ return bucket;
+ }
+
+ return NULL;
+}
+
+/**
+ * io_latency_bucket_alloc - Allocate a bucket
+ *
+ * @index: index of the bucket to allocate
+ *
+ * Allocate and initialize a bucket structure
+ *
+ * Returns a pointer to a bucket or NULL if the allocation failed
+ */
+static struct bucket *io_latency_bucket_alloc(int index)
+{
+ struct bucket *bucket;
+
+ bucket = kmem_cache_alloc(bucket_cachep, GFP_KERNEL);
+ if (bucket) {
+ bucket->hits = 0;
+ bucket->successive_hits = 0;
+ bucket->index = index;
+ bucket->average = 0;
+ INIT_LIST_HEAD(&bucket->list);
+ }
+
+ return bucket;
+}
+
+/**
+ * io_latency_guessed_bucket - try to predict the next bucket
+ *
+ * @tsk: the task whose bucket list is examined
+ *
+ * The list is ordered by history: the first element is the one with
+ * the most *successive* hits. This function is called each time a new
+ * latency is inserted. The algorithm is pretty simple: as the first
+ * element is the one most likely to occur next, it gets the biggest
+ * weight, the second one gets less weight, etc ...
+ *
+ * The bucket with the maximum score (number of hits weighted by its
+ * position in the list) is the bucket most likely to occur next.
+ *
+ * Returns a pointer to the bucket structure, NULL if there are no
+ * buckets in the list
+ */
+static struct bucket *io_latency_guessed_bucket(struct task_struct *tsk)
+{
+ int weight = 0;
+ int score, score_max = 0;
+ struct bucket *bucket, *winner = NULL;
+ struct list_head *list = NULL;
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+
+ if (list_empty(bucket_list))
+ return NULL;
+
+ list_for_each(list, bucket_list) {
+
+ bucket = list_entry(list, struct bucket, list);
+
+ /*
+ * The list is ordered by history, the first element has
+ * more weight than the next one
+ */
+ score = bucket->hits / ((2 * weight) + 1);
+
+ weight++;
+
+ if (score < score_max)
+ continue;
+
+ score_max = score;
+ winner = bucket;
+ }
+
+ return winner;
+}
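
For example (hypothetical hit counts), with three buckets in list order A, B, C holding 12, 30 and 4 hits, the loop computes:

	score(A) = 12 / (2 * 0 + 1) = 12
	score(B) = 30 / (2 * 1 + 1) = 10
	score(C) =  4 / (2 * 2 + 1) =  0	(integer division)

so A is guessed even though B has more raw hits: sitting at the head of the list (i.e. having been hit BUCKET_SUCCESSIVE times in a row recently) outweighs older history. Note that since the comparison is "score < score_max", a later bucket wins a tie.
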
+
+/*
+ * io_latency_bucket_index - Returns the bucket index for the specified latency
+ *
+ * @latency: the latency (in usec) to map to a bucket index
+ *
+ * Returns an integer for the bucket's index
+ */
+static int io_latency_bucket_index(int latency)
+{
+ return latency / BUCKET_INTERVAL;
+}
+
+/*
+ * io_latency_bucket_fill - Compute and fill the bucket list
+ *
+ * @tsk: the task completing an IO
+ * @latency: the latency of the IO
+ *
+ * The dynamics of the list are as follows:
+ * - Each new element is inserted at the end of the list
+ * - Each element hit <BUCKET_SUCCESSIVE> successive times in this function
+ *   is moved to the beginning of the list
+ *
+ * Returns 0 on success, -1 if a bucket allocation failed
+ */
+static int io_latency_bucket_fill(struct task_struct *tsk, int latency)
+{
+ int diff, index = io_latency_bucket_index(latency);
+ struct bucket *bucket;
+
+ /*
+ * Find the bucket associated with the index
+ */
+ bucket = io_latency_bucket_find(tsk, index);
+ if (!bucket) {
+ bucket = io_latency_bucket_alloc(index);
+ if (!bucket)
+ return -1;
+
+ list_add_tail(&bucket->list, &tsk->io_latency.bucket_list);
+ }
+
+ /*
+ * Increase the number of times this bucket has been hit
+ */
+ bucket->hits++;
+ bucket->successive_hits++;
+
+ /*
+ * Compute a sliding average for latency in this bucket
+ */
+ diff = latency - bucket->average;
+ bucket->average += (diff >> 6);
+
+ /*
+ * We hit a successive number of times the same bucket, move
+ * it at the beginning of the list
+ */
+ if (bucket->successive_hits == BUCKET_SUCCESSIVE) {
+ list_move(&bucket->list, &tsk->io_latency.bucket_list);
+ bucket->successive_hits = 1;
+ }
+
+ return 0;
+}
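
The "bucket->average += diff >> 6" update above is an exponential moving average with a 1/64 weight:

	avg_new = avg_old + (latency - avg_old) / 64

so a single sample moves the average by roughly 1.5% of its distance to the observed latency, and about 64 samples are needed for the average to track a sustained change in the IO latency.
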
+
+/*
+ * exit_io_latency - free resources when the task exits
+ *
+ * @tsk : the exiting task
+ *
+ */
+void exit_io_latency(struct task_struct *tsk)
+{
+ struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+ struct list_head *tmp, *list;
+ struct bucket *bucket;
+
+ list_for_each_safe(list, tmp, bucket_list) {
+
+ list_del(list);
+ bucket = list_entry(list, struct bucket, list);
+ kmem_cache_free(bucket_cachep, bucket);
+ }
+}
+
+/**
+ * io_latency_init : initialization routine
+ *
+ * Initializes the cache pool and the io latency rb trees.
+ */
+void io_latency_init(void)
+{
+ int cpu;
+ struct io_latency_tree *latency_tree;
+ struct rb_root *root;
+
+ bucket_cachep = KMEM_CACHE(bucket, SLAB_PANIC);
+
+ for_each_possible_cpu(cpu) {
+ latency_tree = &per_cpu(latency_trees, cpu);
+ latency_tree->left_most = NULL;
+ spin_lock_init(&latency_tree->lock);
+ root = &latency_tree->tree;
+ root->rb_node = NULL;
+ }
+}
+
+/**
+ * io_latency_get_sleep_length: compute the expected sleep time
+ *
+ * @rq: the runqueue associated with the cpu
+ *
+ * Returns the minimal estimated remaining sleep time for the pending IOs
+ */
+s64 io_latency_get_sleep_length(struct rq *rq)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct io_latency_node *node;
+ ktime_t now = ktime_get();
+ s64 diff;
+
+ node = latency_tree->left_most;
+
+ if (!node)
+ return 0;
+
+ diff = ktime_to_us(ktime_sub(now, node->start_time));
+ diff = node->avg_latency - diff;
+
+ /* Estimation was wrong, return 0 */
+ if (diff < 0)
+ return 0;
+
+ return diff;
+}
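
For instance (illustrative numbers), if the left-most blocked task has an avg_latency of 900 us and its IO started 250 us ago, the expected remaining sleep is:

	900 - 250 = 650 us

and once the elapsed time exceeds the average, the estimate is considered wrong and 0 is returned.
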
+
+/**
+ * io_latency_avg: compute the io latency sliding average value
+ *
+ * @tsk: the task whose io latency average is to be updated
+ *
+ */
+static void io_latency_avg(struct task_struct *tsk)
+{
+ struct io_latency_node *node = &tsk->io_latency;
+ s64 latency = ktime_to_us(ktime_sub(node->end_time, node->start_time));
+ struct bucket *bucket;
+
+ io_latency_bucket_fill(tsk, latency);
+
+ bucket = io_latency_guessed_bucket(tsk);
+ if (bucket)
+ node->avg_latency = bucket->average;
+}
+
+/**
+ * io_latency_begin - insert the node in the rb tree
+ *
+ * @rq: the runqueue the task is running on
+ * @task: the task being blocked on an IO
+ *
+ * Inserts the node in the rbtree in an ordered manner. If this task
+ * has the minimal io latency of all the tasks blocked on IO, it becomes
+ * the left most node and a shortcut to it is kept. Also stores the
+ * start time of the io schedule.
+ *
+ */
+int io_latency_begin(struct rq *rq, struct task_struct *tsk)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct rb_root *root = &latency_tree->tree;
+ struct io_latency_node *node = &tsk->io_latency;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct io_latency_node *lat;
+ int leftmost = 1;
+
+ node->start_time = ktime_get();
+
+ spin_lock(&latency_tree->lock);
+
+ while (*new) {
+ lat = rb_entry(*new, struct io_latency_node, node);
+
+ parent = *new;
+
+ if (lat->avg_latency > node->avg_latency)
+ new = &parent->rb_left;
+ else {
+ new = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ latency_tree->left_most = node;
+
+ rb_link_node(&node->node, parent, new);
+ rb_insert_color(&node->node, root);
+
+ spin_unlock(&latency_tree->lock);
+
+ return 0;
+}
+
+/**
+ * io_latency_end - Removes the node from the rb tree
+ *
+ * @rq: the runqueue the task belongs to
+ * @tsk: the task woken up after an IO completion
+ *
+ * Removes the node from the rb tree for this cpu. Updates the left most
+ * shortcut to the next node if this node is itself the left most one.
+ * Retrieves the end time after the io has completed and updates the
+ * io latency average time.
+ */
+void io_latency_end(struct rq *rq, struct task_struct *tsk)
+{
+ int cpu = rq->cpu;
+ struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
+ struct rb_root *root = &latency_tree->tree;
+ struct io_latency_node *old = &tsk->io_latency;
+
+ old->end_time = ktime_get();
+
+ spin_lock(&latency_tree->lock);
+
+ if (latency_tree->left_most == old) {
+ struct rb_node *next_node =
+ rb_next(&latency_tree->left_most->node);
+ latency_tree->left_most =
+ rb_entry(next_node, struct io_latency_node, node);
+ }
+
+ rb_erase(&old->node, root);
+
+ spin_unlock(&latency_tree->lock);
+
+ io_latency_avg(tsk);
+
+ trace_io_latency_entry(
+ ktime_to_us(ktime_sub(old->end_time, old->start_time)),
+ old->avg_latency);
+}
diff --git a/kernel/sched/io_latency.h b/kernel/sched/io_latency.h
new file mode 100644
index 000000000000..abe063ee6417
--- /dev/null
+++ b/kernel/sched/io_latency.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2014 ARM/Linaro
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+#ifdef CONFIG_SCHED_IO_LATENCY
+extern void io_latency_init(void);
+extern int io_latency_begin(struct rq *rq, struct task_struct *tsk);
+extern void io_latency_end(struct rq *rq, struct task_struct *tsk);
+extern int io_latency_get_sleep_length(struct rq *rq);
+#else
+static inline void io_latency_init(void)
+{
+ ;
+}
+
+static inline int io_latency_begin(struct rq *rq, struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void io_latency_end(struct rq *rq, struct task_struct *tsk)
+{
+ ;
+}
+
+static inline int io_latency_get_sleep_length(struct rq *rq)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..947663d2935a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -284,6 +284,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "sched_energy_option",
+ .data = &sysctl_sched_energy_option,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4e35a5d767ed..6e5b5ba795db 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -812,6 +812,15 @@ config SCHED_DEBUG
that can help debug the scheduler. The runtime overhead of this
option is minimal.
+config SCHED_IDLE_DEBUG
+ bool "Collect idle prediction accuracy"
+ depends on CPU_IDLE && DEBUG_FS
+ default n
+ help
+ If you say Y here, the /sys/kernel/debug/sched/idle directory
+ will provide statistics about the success of the idle state
+ predictions. The overhead is negligible.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS