author		Vincent Guittot <vincent.guittot@linaro.org>	2015-02-02 13:38:27 +0100
committer	Vincent Guittot <vincent.guittot@linaro.org>	2015-02-02 13:38:27 +0100
commit		535883342fd96de91597048a78158faf8e2c59c6 (patch)
tree		b13e25cf53e374416a1efc1b01e185ff8ccf7446
parent		15bc600daf64ec7478988611650bb18c5a03664d (diff)
parent		30ae92e267453cc86953200c0f78db0b0995d631 (diff)
Merge branch 'test-sched-cpuidle-tc2' into test-sched-all-tc2
 drivers/acpi/processor_idle.c      |  10
 drivers/ata/libata-sff.c           |   9
 drivers/ata/pata_mpiix.c           |   2
 drivers/cpuidle/Kconfig            |   4
 drivers/cpuidle/cpuidle.c          |  92
 drivers/cpuidle/driver.c           |  32
 drivers/cpuidle/governors/Makefile |   1
 drivers/cpuidle/governors/irq.c    |  33
 drivers/cpuidle/governors/ladder.c |  32
 drivers/cpuidle/governors/menu.c   | 205
 drivers/cpuidle/sysfs.c            | 156
 drivers/idle/intel_idle.c          |   2
 include/linux/cpuidle.h            |  35
 include/linux/interrupt.h          |   1
 include/linux/irq.h                |   6
 include/linux/irqdesc.h            |   5
 include/trace/events/irq.h         |  45
 kernel/irq/Kconfig                 |   5
 kernel/irq/Makefile                |   1
 kernel/irq/internals.h             |  18
 kernel/irq/irqdesc.c               |   2
 kernel/irq/manage.c                |   4
 kernel/irq/proc.c                  |  67
 kernel/irq/timings.c               | 338
 kernel/sched/fair.c                |  58
 kernel/sched/features.h            |   5
 kernel/sched/idle.c                |  55
27 files changed, 989 insertions(+), 234 deletions(-)
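Editor's note: taken together, the merged branches make the idle-state decision explicit. kernel/sched/idle.c samples the PM QoS latency bound and the next timer expiry, the new irq governor (drivers/cpuidle/governors/irq.c) shrinks that sleep horizon with the per-CPU IRQ prediction from kernel/irq/timings.c, and cpuidle_find_state() picks the deepest state whose target residency and exit latency both fit. Below is a minimal stand-alone C sketch of that selection walk; the state table and the sample numbers are illustrative, only the walk mirrors cpuidle_find_state() and the irq governor's select() in the diff that follows.

#include <stdio.h>
#include <stdint.h>

struct state {
	const char *name;
	unsigned int exit_latency;	/* worst-case wakeup cost, us */
	unsigned int target_residency;	/* break-even sleep time, us */
};

/* Same walk as cpuidle_find_state(): the deepest state that fits wins. */
static int find_state(const struct state *states, int count,
		      unsigned int sleep_time, unsigned int latency_req)
{
	int i, ret = -1;

	for (i = 0; i < count; i++) {
		if (states[i].target_residency > sleep_time)
			continue;
		if (states[i].exit_latency > latency_req)
			continue;
		ret = i;	/* states are ordered shallow to deep */
	}
	return ret;
}

static int64_t min64(int64_t a, int64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	const struct state states[] = {
		{ "WFI", 1,   1    },
		{ "C2",  50,  200  },
		{ "C3",  300, 2000 },
	};
	unsigned int latency_req = 400;	 /* PM_QOS_CPU_DMA_LATENCY bound */
	int64_t next_timer_event = 5000; /* tick_nohz_get_sleep_length(), us */
	int64_t next_irq_event = 150;	 /* irqt_get_next_prediction(), 0 = none */

	/* What the irq governor's select() does with the two horizons */
	int64_t next_event = next_irq_event ?
		min64(next_irq_event, next_timer_event) : next_timer_event;

	int timer_only = find_state(states, 3,
				    (unsigned int)next_timer_event, latency_req);
	int with_irq = find_state(states, 3,
				  (unsigned int)next_event, latency_req);

	/* With these numbers: C3 on the timer alone, WFI once the
	 * predicted IRQ caps the horizon at 150us. */
	printf("timer horizon only:  %s\n", states[timer_only].name);
	printf("with IRQ prediction: %s\n", states[with_irq].name);
	return 0;
}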
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 87b704e41877..5d0bcd21a822 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -782,7 +782,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 	if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
 	    !pr->flags.has_cst &&
 	    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
-		return acpi_idle_enter_c1(dev, drv, CPUIDLE_DRIVER_STATE_START);
+		return acpi_idle_enter_c1(dev, drv, 0);
 #endif
 
 	/*
@@ -830,7 +830,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 	if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
 	    !pr->flags.has_cst &&
 	    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
-		return acpi_idle_enter_c1(dev, drv, CPUIDLE_DRIVER_STATE_START);
+		return acpi_idle_enter_c1(dev, drv, 0);
 #endif
 
 	if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
@@ -905,7 +905,7 @@ struct cpuidle_driver acpi_idle_driver = {
 static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
 					   struct cpuidle_device *dev)
 {
-	int i, count = CPUIDLE_DRIVER_STATE_START;
+	int i, count = 0;
 	struct acpi_processor_cx *cx;
 
 	if (!pr->flags.power_setup_done)
@@ -950,7 +950,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
  */
 static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
 {
-	int i, count = CPUIDLE_DRIVER_STATE_START;
+	int i, count = 0;
 	struct acpi_processor_cx *cx;
 	struct cpuidle_state *state;
 	struct cpuidle_driver *drv = &acpi_idle_driver;
@@ -985,6 +985,8 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
 		state->flags = 0;
 		switch (cx->type) {
 		case ACPI_STATE_C1:
+			if (cx->entry_method != ACPI_CSTATE_FFH)
+				state->flags |= CPUIDLE_FLAG_TIME_INVALID;
 
 			state->enter = acpi_idle_enter_c1;
 			state->enter_dead = acpi_idle_play_dead;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index db90aa35cb71..062297326f07 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2440,7 +2440,8 @@ int ata_pci_sff_activate_host(struct ata_host *host,
 		int i;
 
 		rc = devm_request_irq(dev, pdev->irq, irq_handler,
-				      IRQF_SHARED, drv_name, host);
+				      IRQF_SHARED | IRQF_TIMINGS,
+				      drv_name, host);
 		if (rc)
 			goto out;
 
@@ -2452,7 +2453,8 @@
 	} else if (legacy_mode) {
 		if (!ata_port_is_dummy(host->ports[0])) {
 			rc = devm_request_irq(dev, ATA_PRIMARY_IRQ(pdev),
-					      irq_handler, IRQF_SHARED,
+					      irq_handler,
+					      IRQF_SHARED | IRQF_TIMINGS,
 					      drv_name, host);
 			if (rc)
 				goto out;
@@ -2463,7 +2465,8 @@
 
 		if (!ata_port_is_dummy(host->ports[1])) {
 			rc = devm_request_irq(dev, ATA_SECONDARY_IRQ(pdev),
-					      irq_handler, IRQF_SHARED,
+					      irq_handler,
+					      IRQF_SHARED | IRQF_TIMINGS,
 					      drv_name, host);
 			if (rc)
 				goto out;
diff --git a/drivers/ata/pata_mpiix.c b/drivers/ata/pata_mpiix.c
index 202b4d601393..b7663242cd60 100644
--- a/drivers/ata/pata_mpiix.c
+++ b/drivers/ata/pata_mpiix.c
@@ -208,7 +208,7 @@ static int mpiix_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	ata_sff_std_ports(&ap->ioaddr);
 
 	/* activate host */
-	return ata_host_activate(host, irq, ata_sff_interrupt, IRQF_SHARED,
+	return ata_host_activate(host, irq, ata_sff_interrupt, IRQF_SHARED | IRQF_TIMINGS,
				 &mpiix_sht);
 }
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c5029c1209b4..1aae78bc9d95 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -25,6 +25,10 @@ config CPU_IDLE_GOV_MENU
 	bool "Menu governor (for tickless system)"
 	default y
 
+config CPU_IDLE_GOV_IRQ
+	bool "Irq governor (for tickless system)"
+	default y
+
 config DT_IDLE_STATES
 	bool
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150dc6e81..1924f4e0a3e7 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,16 +8,12 @@
  * This code is licenced under the GPL.
  */
 
-#include <linux/clockchips.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
 #include <linux/module.h>
 
 #include <trace/events/power.h>
@@ -58,7 +54,7 @@ int cpuidle_play_dead(void)
 		return -ENODEV;
 
 	/* Find lowest-power state that supports long-term idle */
-	for (i = drv->state_count - 1; i >= CPUIDLE_DRIVER_STATE_START; i--)
+	for (i = drv->state_count - 1; i >= 0; i--)
 		if (drv->states[i].enter_dead)
 			return drv->states[i].enter_dead(dev, i);
 
@@ -81,24 +77,33 @@ void cpuidle_use_deepest_state(bool enable)
 }
 
 /**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
+ * cpuidle_find_state - Find an idle state given the constraints
+ *
  * @drv: cpuidle driver for a given CPU.
 * @dev: cpuidle device for a given CPU.
+ *
+ * Returns an index of the state fulfilling the time constraint passed as
+ * parameter, -1 otherwise
+ *
 */
-static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+int cpuidle_find_state(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       unsigned int sleep_time, unsigned int latency_req)
 {
-	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = -1;
 
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+	for (i = 0; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable)
+			continue;
+
+		if (s->target_residency > sleep_time)
+			continue;
+
+		if (s->exit_latency > latency_req)
 			continue;
 
-		latency_req = s->exit_latency;
 		ret = i;
 	}
 	return ret;
@@ -116,21 +121,54 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	int entered_state;
 
 	struct cpuidle_state *target_state = &drv->states[index];
-	ktime_t time_start, time_end;
 	s64 diff;
 
 	trace_cpu_idle_rcuidle(index, dev->cpu);
-	time_start = ktime_get();
 
+	/*
+	 * Store the idle start time for this cpu, this information
+	 * will be used by cpuidle to measure how long the cpu has
+	 * been idle and by the scheduler to prevent to wake it up too
+	 * early
+	 */
+	target_state->idle_stamp = ktime_to_us(ktime_get());
+
+	/*
+	 * The enter the low level idle routine. This call will block
+	 * until an interrupt occurs meaning it is the end of the idle
+	 * period
+	 */
 	entered_state = target_state->enter(dev, drv, index);
-	time_end = ktime_get();
 
+	/*
+	 * Measure as soon as possible the duration of the idle
+	 * period. It MUST be done before re-enabling the interrupt in
+	 * order to prevent to add in the idle time measurement the
+	 * interrupt handling duration
+	 */
+	diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
+
+	/*
+	 * Reset the idle time stamp as the scheduler thinks the cpu is idle
+	 * while it is in the process of waking up
+	 */
+	target_state->idle_stamp = 0;
 
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
+	/*
+	 * The cpuidle_enter_coupled uses the cpuidle_enter function.
+	 * Don't re-enable the interrupts and let the enter_coupled
+	 * function to wait for all cpus to sync and to enable the
+	 * interrupts again from there
+	 */
 	if (!cpuidle_state_is_coupled(dev, drv, entered_state))
 		local_irq_enable();
 
-	diff = ktime_to_us(ktime_sub(time_end, time_start));
+	/*
+	 * The idle duration will be casted to an integer, prevent to
+	 * overflow by setting a boundary to INT_MAX
+	 */
 	if (diff > INT_MAX)
 		diff = INT_MAX;
@@ -143,6 +181,16 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 		 */
 		dev->states_usage[entered_state].time += dev->last_residency;
 		dev->states_usage[entered_state].usage++;
+
+		if (diff < drv->states[entered_state].target_residency) {
+			atomic_inc(&dev->over_estimate);
+		} else if (entered_state < (drv->state_count - 1) &&
+			   diff >=
+			   drv->states[entered_state + 1].target_residency) {
+			atomic_inc(&dev->under_estimate);
+		} else {
+			atomic_inc(&dev->right_estimate);
+		}
 	} else {
 		dev->last_residency = 0;
 	}
@@ -155,10 +203,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
  *
  * @drv: the cpuidle driver
  * @dev: the cpuidle device
+ * @latency_req: the latency constraint when choosing an idle state
+ * @next_timer_event: the duration until the timer expires
 *
 * Returns the index of the idle state.
 */
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		   int latency_req, s64 next_timer_event)
 {
 	if (off || !initialized)
 		return -ENODEV;
@@ -167,9 +218,10 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		return -EBUSY;
 
 	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
+		return cpuidle_find_state(drv, dev, UINT_MAX, UINT_MAX);
 
-	return cpuidle_curr_governor->select(drv, dev);
+	return cpuidle_curr_governor->select(drv, dev, latency_req,
					     next_timer_event);
 }
 
 /**
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 2697e87d5b34..16d67aa1c801 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -178,36 +178,6 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 	}
 }
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev,
-		struct cpuidle_driver *drv, int index)
-{
-	local_irq_enable();
-	if (!current_set_polling_and_test()) {
-		while (!need_resched())
-			cpu_relax();
-	}
-	current_clr_polling();
-
-	return index;
-}
-
-static void poll_idle_init(struct cpuidle_driver *drv)
-{
-	struct cpuidle_state *state = &drv->states[0];
-
-	snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
-	snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
-	state->exit_latency = 0;
-	state->target_residency = 0;
-	state->power_usage = -1;
-	state->enter = poll_idle;
-	state->disabled = false;
-}
-#else
-static void poll_idle_init(struct cpuidle_driver *drv) {}
-#endif /* !CONFIG_ARCH_HAS_CPU_RELAX */
-
 /**
 * __cpuidle_register_driver: register the driver
 * @drv: a valid pointer to a struct cpuidle_driver
@@ -241,8 +211,6 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
 		on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1);
 
-	poll_idle_init(drv);
-
 	return 0;
 }
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 1b512722689f..8804ee2f550c 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
 obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_IRQ) += irq.o
diff --git a/drivers/cpuidle/governors/irq.c b/drivers/cpuidle/governors/irq.c
new file mode 100644
index 000000000000..de99f4545255
--- /dev/null
+++ b/drivers/cpuidle/governors/irq.c
@@ -0,0 +1,33 @@
+/*
+ * irq.c - the irq governor
+ *
+ * Copyright (C) 2014 Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+*/
+#include <linux/ktime.h>
+#include <linux/irq.h>
+#include <linux/cpuidle.h>
+
+static int select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		  int latency_req, s64 next_timer_event)
+{
+	s64 next_irq_event = irqt_get_next_prediction(dev->cpu);
+	s64 next_event = next_irq_event ?
+		min(next_irq_event, next_timer_event) : next_timer_event;
+
+	return cpuidle_find_state(drv, dev, next_event, latency_req);
+}
+
+static struct cpuidle_governor irq_governor = {
+	.name = "irq",
+	.rating = 30,
+	.select = select,
+	.owner = THIS_MODULE,
+};
+
+static int __init irq_init(void)
+{
+	return cpuidle_register_governor(&irq_governor);
+}
+
+postcore_initcall(irq_init);
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 401c0106ed34..5a76dff631b9 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -64,22 +64,21 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
 * @dev: the CPU
 */
 static int ladder_select_state(struct cpuidle_driver *drv,
-			       struct cpuidle_device *dev)
+			       struct cpuidle_device *dev,
+			       int latency_req, s64 next_timer_event)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
 	struct ladder_device_state *last_state;
 	int last_residency, last_idx = ldev->last_state_idx;
-	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-
-	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0)) {
-		ladder_do_selection(ldev, last_idx, 0);
-		return 0;
-	}
 
 	last_state = &ldev->states[last_idx];
 
-	last_residency = cpuidle_get_last_residency(dev) - drv->states[last_idx].exit_latency;
+	if (!(drv->states[last_idx].flags & CPUIDLE_FLAG_TIME_INVALID)) {
+		last_residency = cpuidle_get_last_residency(dev) - \
			drv->states[last_idx].exit_latency;
+	}
+	else
+		last_residency = last_state->threshold.promotion_time + 1;
 
 	/* consider promotion */
 	if (last_idx < drv->state_count - 1 &&
@@ -96,13 +95,13 @@ static int ladder_select_state(struct cpuidle_driver *drv,
 	}
 
 	/* consider demotion */
-	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
+	if (last_idx > 0 &&
 	    (drv->states[last_idx].disabled ||
 	    dev->states_usage[last_idx].disable ||
 	    drv->states[last_idx].exit_latency > latency_req)) {
 		int i;
 
-		for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
+		for (i = last_idx - 1; i > 0; i--) {
 			if (drv->states[i].exit_latency <= latency_req)
				break;
 		}
@@ -110,7 +109,7 @@ static int ladder_select_state(struct cpuidle_driver *drv,
 		return i;
 	}
 
-	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
+	if (last_idx > 0 &&
 	    last_residency < last_state->threshold.demotion_time) {
 		last_state->stats.demotion_count++;
 		last_state->stats.promotion_count = 0;
@@ -137,9 +136,9 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 	struct ladder_device_state *lstate;
 	struct cpuidle_state *state;
 
-	ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START;
+	ldev->last_state_idx = 0;
 
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+	for (i = 0; i < drv->state_count; i++) {
 		state = &drv->states[i];
 		lstate = &ldev->states[i];
@@ -151,7 +150,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 
 		if (i < drv->state_count - 1)
			lstate->threshold.promotion_time = state->exit_latency;
-		if (i > CPUIDLE_DRIVER_STATE_START)
+		if (i > 0)
			lstate->threshold.demotion_time = state->exit_latency;
 	}
 
@@ -166,8 +165,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 static void ladder_reflect(struct cpuidle_device *dev, int index)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
-	if (index > 0)
-		ldev->last_state_idx = index;
+	ldev->last_state_idx = index;
 }
 
 static struct cpuidle_governor ladder_governor = {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 40580794e23d..ac2be02de5c6 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -13,10 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
 #include <linux/pm_qos.h>
-#include <linux/time.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
 #include <linux/module.h>
@@ -188,7 +184,6 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
-static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
 
 /* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
 static u64 div_round64(u64 dividend, u32 divisor)
@@ -196,13 +191,87 @@ static u64 div_round64(u64 dividend, u32 divisor)
 	return div_u64(dividend + (divisor / 2), divisor);
 }
 
+/**
+ * menu_update - attempts to guess what happened after entry
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ */
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	struct menu_device *data = this_cpu_ptr(&menu_devices);
+	int last_idx = data->last_state_idx;
+	struct cpuidle_state *target = &drv->states[last_idx];
+	unsigned int measured_us;
+	unsigned int new_factor;
+
+	/*
+	 * Try to figure out how much time passed between entry to low
+	 * power state and occurrence of the wakeup event.
+	 *
+	 * If the entered idle state didn't support residency measurements,
+	 * we are basically lost in the dark how much time passed.
+	 * As a compromise, assume we slept for the whole expected time.
+	 *
+	 * Any measured amount of time will include the exit latency.
+	 * Since we are interested in when the wakeup begun, not when it
+	 * was completed, we must subtract the exit latency. However, if
+	 * the measured amount of time is less than the exit latency,
+	 * assume the state was never reached and the exit latency is 0.
+	 */
+	if (unlikely(target->flags & CPUIDLE_FLAG_TIME_INVALID)) {
+		/* Use timer value as is */
+		measured_us = data->next_timer_us;
+
+	} else {
+		/* Use measured value */
+		measured_us = cpuidle_get_last_residency(dev);
+
+		/* Deduct exit latency */
+		if (measured_us > target->exit_latency)
+			measured_us -= target->exit_latency;
+
+		/* Make sure our coefficients do not exceed unity */
+		if (measured_us > data->next_timer_us)
+			measured_us = data->next_timer_us;
+	}
+
+	/* Update our correction ratio */
+	new_factor = data->correction_factor[data->bucket];
+	new_factor -= new_factor / DECAY;
+
+	if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
+		new_factor += RESOLUTION * measured_us / data->next_timer_us;
+	else
+		/*
+		 * we were idle so long that we count it as a perfect
+		 * prediction
+		 */
+		new_factor += RESOLUTION;
+
+	/*
+	 * We don't want 0 as factor; we always want at least
+	 * a tiny bit of estimated time. Fortunately, due to rounding,
+	 * new_factor will stay nonzero regardless of measured_us values
+	 * and the compiler can eliminate this test as long as DECAY > 1.
+	 */
+	if (DECAY == 1 && unlikely(new_factor == 0))
+		new_factor = 1;
+
+	data->correction_factor[data->bucket] = new_factor;
+
+	/* update the repeating-pattern data */
+	data->intervals[data->interval_ptr++] = measured_us;
+	if (data->interval_ptr >= INTERVALS)
+		data->interval_ptr = 0;
+}
+
 /*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
-static void get_typical_interval(struct menu_device *data)
+static unsigned int get_typical_interval(struct menu_device *data)
 {
 	int i, divisor;
 	unsigned int max, thresh;
@@ -259,11 +328,8 @@ again:
 	if (likely(stddev <= ULONG_MAX)) {
 		stddev = int_sqrt(stddev);
 		if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
-							|| stddev <= 20) {
-			if (data->next_timer_us > avg)
-				data->predicted_us = avg;
-			return;
-		}
+							|| stddev <= 20)
+			return avg;
 	}
 
 	/*
@@ -276,7 +342,7 @@ again:
	 * with sporadic activity with a bunch of short pauses.
	 */
 	if ((divisor * 4) <= INTERVALS * 3)
-		return;
+		return 0;
 
 	thresh = max - 1;
 	goto again;
@@ -287,12 +353,12 @@ again:
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       int latency_req, s64 next_timer_event)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
-	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-	int i;
 	unsigned int interactivity_req;
+	unsigned int interactivity_overrride_us;
 	unsigned long nr_iowaiters, cpu_load;
 
 	if (data->needs_update) {
@@ -300,14 +366,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		data->needs_update = 0;
 	}
 
-	data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
-
-	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0))
-		return 0;
+	data->last_state_idx = 0;
 
 	/* determine the expected residency time, round up */
-	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+	data->next_timer_us = next_timer_event;
 
 	get_iowait_load(&nr_iowaiters, &cpu_load);
 	data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -321,7 +383,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
					 data->correction_factor[data->bucket],
					 RESOLUTION * DECAY);
 
-	get_typical_interval(data);
+	interactivity_overrride_us = get_typical_interval(data);
+	if (interactivity_overrride_us &&
+	    data->next_timer_us > interactivity_overrride_us)
+		data->predicted_us = interactivity_overrride_us;
 
 	/*
	 * Performance multiplier defines a minimum predicted idle
@@ -333,31 +398,11 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		latency_req = interactivity_req;
 
 	/*
-	 * We want to default to C1 (hlt), not to busy polling
-	 * unless the timer is happening really really soon.
-	 */
-	if (data->next_timer_us > 5 &&
-	    !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
-	    dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
-		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
-
-	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
-		struct cpuidle_state *s = &drv->states[i];
-		struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-		if (s->disabled || su->disable)
-			continue;
-		if (s->target_residency > data->predicted_us)
-			continue;
-		if (s->exit_latency > latency_req)
-			continue;
-
-		data->last_state_idx = i;
-	}
+	data->last_state_idx = cpuidle_find_state(drv, dev, data->predicted_us,
						  latency_req);
 
 	return data->last_state_idx;
 }
@@ -374,77 +419,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	data->last_state_idx = index;
-	if (index >= 0)
-		data->needs_update = 1;
-}
-
-/**
- * menu_update - attempts to guess what happened after entry
- * @drv: cpuidle driver containing state data
- * @dev: the CPU
- */
-static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
-{
-	struct menu_device *data = this_cpu_ptr(&menu_devices);
-	int last_idx = data->last_state_idx;
-	struct cpuidle_state *target = &drv->states[last_idx];
-	unsigned int measured_us;
-	unsigned int new_factor;
-
-	/*
-	 * Try to figure out how much time passed between entry to low
-	 * power state and occurrence of the wakeup event.
-	 *
-	 * If the entered idle state didn't support residency measurements,
-	 * we use them anyway if they are short, and if long,
-	 * truncate to the whole expected time.
-	 *
-	 * Any measured amount of time will include the exit latency.
-	 * Since we are interested in when the wakeup begun, not when it
-	 * was completed, we must subtract the exit latency. However, if
-	 * the measured amount of time is less than the exit latency,
-	 * assume the state was never reached and the exit latency is 0.
-	 */
-
-	/* measured value */
-	measured_us = cpuidle_get_last_residency(dev);
-
-	/* Deduct exit latency */
-	if (measured_us > target->exit_latency)
-		measured_us -= target->exit_latency;
-
-	/* Make sure our coefficients do not exceed unity */
-	if (measured_us > data->next_timer_us)
-		measured_us = data->next_timer_us;
-
-	/* Update our correction ratio */
-	new_factor = data->correction_factor[data->bucket];
-	new_factor -= new_factor / DECAY;
-
-	if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
-		new_factor += RESOLUTION * measured_us / data->next_timer_us;
-	else
-		/*
-		 * we were idle so long that we count it as a perfect
-		 * prediction
-		 */
-		new_factor += RESOLUTION;
-
-	/*
-	 * We don't want 0 as factor; we always want at least
-	 * a tiny bit of estimated time. Fortunately, due to rounding,
-	 * new_factor will stay nonzero regardless of measured_us values
-	 * and the compiler can eliminate this test as long as DECAY > 1.
-	 */
-	if (DECAY == 1 && unlikely(new_factor == 0))
-		new_factor = 1;
-
-	data->correction_factor[data->bucket] = new_factor;
-
-	/* update the repeating-pattern data */
-	data->intervals[data->interval_ptr++] = measured_us;
-	if (data->interval_ptr >= INTERVALS)
-		data->interval_ptr = 0;
+	data->needs_update = 1;
 }
 
 /**
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 97c5903b4606..f446bd0fd9bd 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -439,6 +439,154 @@ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 		cpuidle_free_state_kobj(device, i);
 }
 
+#define kobj_to_stats_kobj(k) container_of(k, struct cpuidle_stats_kobj, kobj)
+#define attr_to_stats_attr(a) container_of(a, struct cpuidle_stats_attr, attr)
+
+#define define_show_stats_function(_name)				\
+	static ssize_t show_stats_##_name(struct cpuidle_device *dev,	\
+					  char *buf)			\
+	{								\
+		return sprintf(buf, "%d\n", atomic_read(&dev->_name));	\
+	}
+
+#define define_store_stats_function(_name)				\
+	static ssize_t store_stats_##_name(struct cpuidle_device *dev,	\
+					   const char *buf, size_t size) \
+	{								\
+		unsigned long long value;				\
+		int err;						\
+		if (!capable(CAP_SYS_ADMIN))				\
+			return -EPERM;					\
+		err = kstrtoull(buf, 0, &value);			\
+		if (err)						\
+			return err;					\
+									\
+		atomic_set(&dev->_name, value);				\
+		return size;						\
+	}
+
+#define define_one_stats_rw(_name, show, store)				\
+	static struct cpuidle_stats_attr attr_stats_##_name =		\
+		__ATTR(_name, 0644, show, store)
+
+struct cpuidle_stats_kobj {
+	struct cpuidle_device *dev;
+	struct completion kobj_unregister;
+	struct kobject kobj;
+};
+
+struct cpuidle_stats_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cpuidle_device *, char *);
+	ssize_t (*store)(struct cpuidle_device *, const char *, size_t);
+};
+
+static void cpuidle_stats_sysfs_release(struct kobject *kobj)
+{
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	complete(&stats_kobj->kobj_unregister);
+}
+
+static ssize_t cpuidle_stats_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	int ret = -EIO;
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+	if (dattr->show)
+		ret = dattr->show(stats_kobj->dev, buf);
+
+	return ret;
+}
+
+static ssize_t cpuidle_stats_store(struct kobject *kobj,
+				   struct attribute *attr,
+				   const char *buf, size_t size)
+{
+	int ret = -EIO;
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+	if (dattr->store)
+		ret = dattr->store(stats_kobj->dev, buf, size);
+
+	return ret;
+}
+
+define_show_stats_function(right_estimate);
+define_store_stats_function(right_estimate);
+
+define_show_stats_function(under_estimate);
+define_store_stats_function(under_estimate);
+
+define_show_stats_function(over_estimate);
+define_store_stats_function(over_estimate);
+
+define_one_stats_rw(right_estimate,
+		    show_stats_right_estimate,
+		    store_stats_right_estimate);
+
+define_one_stats_rw(under_estimate,
+		    show_stats_under_estimate,
+		    store_stats_under_estimate);
+
+define_one_stats_rw(over_estimate,
+		    show_stats_over_estimate,
+		    store_stats_over_estimate);
+
+static const struct sysfs_ops cpuidle_stats_sysfs_ops = {
+	.show = cpuidle_stats_show,
+	.store = cpuidle_stats_store,
+};
+
+static struct attribute *cpuidle_stats_default_attrs[] = {
+	&attr_stats_right_estimate.attr,
+	&attr_stats_under_estimate.attr,
+	&attr_stats_over_estimate.attr,
+	NULL
+};
+
+static struct kobj_type ktype_stats_cpuidle = {
+	.sysfs_ops = &cpuidle_stats_sysfs_ops,
+	.default_attrs = cpuidle_stats_default_attrs,
+	.release = cpuidle_stats_sysfs_release,
+};
+
+static int cpuidle_add_stats_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_stats_kobj *kstats;
+	struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+	int ret;
+
+	kstats = kzalloc(sizeof(*kstats), GFP_KERNEL);
+	if (!kstats)
+		return -ENOMEM;
+
+	kstats->dev = dev;
+	init_completion(&kstats->kobj_unregister);
+
+	ret = kobject_init_and_add(&kstats->kobj, &ktype_stats_cpuidle,
+				   &kdev->kobj, "stats");
+	if (ret) {
+		kfree(kstats);
+		return ret;
+	}
+
+	kobject_uevent(&kstats->kobj, KOBJ_ADD);
+	dev->kobj_stats = kstats;
+
+	return ret;
+}
+
+static void cpuidle_remove_stats_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_stats_kobj *kstats = dev->kobj_stats;
+
+	kobject_put(&kstats->kobj);
+	wait_for_completion(&kstats->kobj_unregister);
+	kfree(kstats);
+}
+
 #ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
 #define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj)
 #define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr)
@@ -589,6 +737,13 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
 	ret = cpuidle_add_driver_sysfs(device);
 	if (ret)
 		cpuidle_remove_state_sysfs(device);
+
+	ret = cpuidle_add_stats_sysfs(device);
+	if (ret) {
+		cpuidle_remove_driver_sysfs(device);
+		cpuidle_remove_state_sysfs(device);
+	}
+
 	return ret;
 }
 
@@ -598,6 +753,7 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
 */
 void cpuidle_remove_device_sysfs(struct cpuidle_device *device)
 {
+	cpuidle_remove_stats_sysfs(device);
 	cpuidle_remove_driver_sysfs(device);
 	cpuidle_remove_state_sysfs(device);
 }
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 9cceacb92f9d..f735355e0df5 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -845,8 +845,6 @@ static int __init intel_idle_cpuidle_driver_init(void)
 
 	intel_idle_state_table_update();
 
-	drv->state_count = 1;
-
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
 		int num_substates, mwait_hint, mwait_cstate;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index ab70f3bc44ad..e1f4914409b3 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,6 +44,7 @@ struct cpuidle_state {
 	int		power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 	bool		disabled; /* disabled on all CPUs */
+	u64		idle_stamp;
 
 	int (*enter)	(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
@@ -53,6 +54,7 @@ struct cpuidle_state {
 };
 
 /* Idle State Flags */
+#define CPUIDLE_FLAG_TIME_INVALID	(0x01) /* is residency time measurable? */
 #define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
 #define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
@@ -61,6 +63,7 @@ struct cpuidle_state {
 struct cpuidle_device_kobj;
 struct cpuidle_state_kobj;
 struct cpuidle_driver_kobj;
+struct cpuidle_stats_kobj;
 
 struct cpuidle_device {
 	unsigned int		registered:1;
@@ -73,8 +76,13 @@ struct cpuidle_device {
 	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
 	struct cpuidle_driver_kobj *kobj_driver;
 	struct cpuidle_device_kobj *kobj_dev;
+	struct cpuidle_stats_kobj *kobj_stats;
 	struct list_head 	device_list;
 
+	atomic_t right_estimate;
+	atomic_t under_estimate;
+	atomic_t over_estimate;
+
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 	int			safe_state_index;
 	cpumask_t		coupled_cpus;
@@ -88,6 +96,8 @@ DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev);
 /**
 * cpuidle_get_last_residency - retrieves the last state's residency time
 * @dev: the target CPU
+ *
+ * NOTE: this value is invalid if CPUIDLE_FLAG_TIME_INVALID is set
 */
 static inline int cpuidle_get_last_residency(struct cpuidle_device *dev)
 {
@@ -118,8 +128,14 @@ struct cpuidle_driver {
 
 #ifdef CONFIG_CPU_IDLE
 extern void disable_cpuidle(void);
+extern int cpuidle_find_state(struct cpuidle_driver *drv,
+			      struct cpuidle_device *dev,
+			      unsigned int sleep_time,
+			      unsigned int latency_req);
+
 extern int cpuidle_select(struct cpuidle_driver *drv,
-			  struct cpuidle_device *dev);
+			  struct cpuidle_device *dev,
+			  int latency_req, s64 next_timer_event);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
			 struct cpuidle_device *dev, int index);
 extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -146,8 +162,14 @@ extern void cpuidle_use_deepest_state(bool enable);
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
 static inline void disable_cpuidle(void) { }
+static inline int cpuidle_find_state(struct cpuidle_driver *drv,
+				     struct cpuidle_device *dev,
+				     unsigned int sleep_time,
+				     unsigned int latency_req)
+{return -ENODEV; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
-				 struct cpuidle_device *dev)
+				 struct cpuidle_device *dev,
+				 int latency_req, s64 next_timer_event)
 {return -ENODEV; }
 static inline int cpuidle_enter(struct cpuidle_driver *drv,
				struct cpuidle_device *dev, int index)
@@ -202,7 +224,8 @@ struct cpuidle_governor {
					struct cpuidle_device *dev);
 
 	int  (*select)		(struct cpuidle_driver *drv,
-					struct cpuidle_device *dev);
+					struct cpuidle_device *dev,
+					int latency_req, s64 next_timer_event);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
 
 	struct module 		*owner;
@@ -215,10 +238,4 @@ static inline int cpuidle_register_governor(struct cpuidle_governor *gov)
 {return 0;}
 #endif
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
-#define CPUIDLE_DRIVER_STATE_START	1
-#else
-#define CPUIDLE_DRIVER_STATE_START	0
-#endif
-
 #endif /* _LINUX_CPUIDLE_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d9b05b5bf8c7..bb7dddc33918 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -59,6 +59,7 @@
 * resume time.
 */
 #define IRQF_DISABLED		0x00000020
+#define IRQF_TIMINGS		0x00000040
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
 #define __IRQF_TIMER		0x00000200
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d09ec7a1243e..8150c1cfc7f4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -862,4 +862,10 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 	return readl(gc->reg_base + reg_offset);
 }
 
+#ifdef CONFIG_IRQ_TIMINGS
+extern s64 irqt_get_next_prediction(int cpu);
+#else
+static inline s64 irqt_get_next_prediction(int cpu) { return 0; }
+#endif
+
 #endif /* _LINUX_IRQ_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index faf433af425e..3d723a946071 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -14,11 +14,13 @@ struct module;
 struct irq_desc;
 struct irq_domain;
 struct pt_regs;
+struct irqt_stat;
 
 /**
 * struct irq_desc - interrupt descriptor
 * @irq_data:		per irq and chip data passed down to chip functions
 * @kstat_irqs:		irq stats per cpu
+ * @irq_timings:	irq occurrence timing statistics
 * @handle_irq:		highlevel irq-events handler
 * @preflow_handler:	handler called before the flow handler (currently used by sparc)
 * @action:		the irq action chain
@@ -49,6 +51,9 @@ struct pt_regs;
 struct irq_desc {
 	struct irq_data		irq_data;
 	unsigned int __percpu	*kstat_irqs;
+#ifdef CONFIG_IRQ_TIMINGS
+	struct irqt_stat	*irq_timings;
+#endif
 	irq_flow_handler_t	handle_irq;
 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
 	irq_preflow_handler_t	preflow_handler;
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 3608bebd3d9c..53910cf3917d 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -84,6 +84,51 @@ TRACE_EVENT(irq_handler_exit,
		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+#ifdef CONFIG_IRQ_TIMINGS
+/**
+ * irq_timings - provide updated IRQ timing statistics
+ * @irq: irq number
+ * @interval: time interval since last irq
+ * @variance: time interval variance
+ * @mean: mean interval
+ * @good: current count of predictable irqs
+ * @bad: current count of unpredictable irqs
+ *
+ * Note: variance is provided/listed before the mean value to help with
+ * alignment constraints on 64-bit values.
+ */
+TRACE_EVENT(irq_timings,
+
+	TP_PROTO(int irq, u32 interval, u64 variance, u32 mean,
+		 u32 good, u32 bad),
+
+	TP_ARGS(irq, interval, variance, mean, good, bad),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq		)
+		__field(	u32,	interval	)
+		__field(	u64,	variance	)
+		__field(	u32,	mean		)
+		__field(	u32,	good		)
+		__field(	u32,	bad		)
+	),
+
+	TP_fast_assign(
+		__entry->irq = irq;
+		__entry->interval = interval;
+		__entry->variance = variance;
+		__entry->mean = mean;
+		__entry->good = good;
+		__entry->bad = bad;
+	),
+
+	TP_printk("irq=%d intv=%u mean=%u variance=%llu (%u vs %u)",
+		  __entry->irq, __entry->interval, __entry->mean,
+		  (unsigned long long)__entry->variance,
+		  __entry->good, __entry->bad)
+);
+#endif
+
 DECLARE_EVENT_CLASS(softirq,
 
	TP_PROTO(unsigned int vec_nr),
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..3a134b685552 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -100,4 +100,9 @@ config SPARSE_IRQ
	  If you don't know what to do here, say N.
 
+# Support for IRQ timing stats and prediction, mainly for cpuidle usage
+config IRQ_TIMINGS
+	bool
+	default y
+
 endmenu
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..fad4b6d2768a 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_IRQ_TIMINGS) += timings.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..a0e7522b2b4f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -110,6 +110,23 @@ static inline void unregister_handler_proc(unsigned int irq,
					   struct irqaction *action) { }
 #endif
 
+#ifdef CONFIG_IRQ_TIMINGS
+extern void __init irqt_init(void);
+extern void irqt_process(unsigned int irq, struct irqt_stat *s);
+static inline void irqt_event(int irq, struct irq_desc *desc)
+{
+	if (desc->irq_timings)
+		irqt_process(irq, desc->irq_timings);
+}
+extern int irqt_register(struct irq_desc *desc);
+extern void irqt_unregister(struct irq_desc *desc);
+#else
+static inline void irqt_init(void) { }
+static inline void irqt_event(int irq, struct irq_desc *desc) { }
+static inline int irqt_register(struct irq_desc *desc) { return 0; }
+static inline void irqt_unregister(struct irq_desc *desc) { }
+#endif
+
 extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
@@ -197,6 +214,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
 {
 	__this_cpu_inc(*desc->kstat_irqs);
 	__this_cpu_inc(kstat.irqs_sum);
+	irqt_event(irq, desc);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23..f31471ebce36 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -232,6 +232,7 @@ int __init early_irq_init(void)
 	int i, initcnt, node = first_online_node;
 	struct irq_desc *desc;
 
+	irqt_init();
 	init_irq_default_affinity();
 
 	/* Let arch update nr_irqs and return the nr of preallocated irqs */
@@ -270,6 +271,7 @@ int __init early_irq_init(void)
 	int count, i, node = first_online_node;
 	struct irq_desc *desc;
 
+	irqt_init();
 	init_irq_default_affinity();
 
 	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..88b487b355ae 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1231,6 +1231,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	register_handler_proc(irq, new);
 	free_cpumask_var(mask);
 
+	if (new->flags & IRQF_TIMINGS)
+		irqt_register(desc);
+
 	return 0;
 
 mismatch:
@@ -1328,6 +1331,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	if (!desc->action) {
 		irq_shutdown(desc);
 		irq_release_resources(desc);
+		irqt_unregister(desc);
 	}
 
 #ifdef CONFIG_SMP
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..4cda809b8512 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -10,8 +10,10 @@
 #include <linux/gfp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/uaccess.h>
 
 #include "internals.h"
 
@@ -283,6 +285,62 @@ static const struct file_operations irq_spurious_proc_fops = {
 	.release = single_release,
 };
 
+static int irq_timings_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long) m->private);
+
+	seq_printf(m, "%d\n", desc->irq_timings ? 1 : 0);
+
+	return 0;
+}
+
+static int irq_timings_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_timings_proc_show, PDE_DATA(inode));
+}
+
+static ssize_t irq_timings_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	long enable;
+	int ret;
+	int irq = (int)(long)PDE_DATA(file_inode(file));
+	struct irq_desc *desc = irq_to_desc(irq);
+	char *buf;
+
+	buf = kzalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(buf, buffer, count))
+		goto out;
+
+	ret = kstrtoul(buf, 0, &enable);
+	if (ret < 0)
+		goto out;
+
+	if (enable) {
+		ret = irqt_register(desc);
+	} else {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		irqt_unregister(desc);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+	}
+out:
+	kfree(buf);
+	return ret ? ret : count;
+}
+
+static const struct file_operations irq_timings_proc_fops = {
+	.open		= irq_timings_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_timings_proc_write,
+};
+
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -358,6 +416,11 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	proc_create_data("spurious", 0444, desc->dir,
			 &irq_spurious_proc_fops, (void *)(long)irq);
 
+#ifdef CONFIG_IRQ_TIMINGS
+	/* create /proc/irq/<irq>/timings */
+	proc_create_data("timings", 0644, desc->dir,
+			 &irq_timings_proc_fops, (void *)(long)irq);
+#endif
 }
 
 void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -373,7 +436,9 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 	remove_proc_entry("node", desc->dir);
 #endif
 	remove_proc_entry("spurious", desc->dir);
-
+#ifdef CONFIG_IRQ_TIMING
+	remove_proc_entry("timing", desc->dir);
+#endif
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%u", irq);
 	remove_proc_entry(name, root_irq_dir);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 000000000000..27994cea4a99
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,338 @@
+/*
+ * IRQ occurrence timing statistics
+ *
+ * Created by:	Nicolas Pitre, November 2014
+ * Copyright:	(C) 2014-2015  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/irq.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "internals.h"
+
+#include <trace/events/irq.h>
+
+
+/*
+ * This is the size of the IRQ interval window used to compute the
+ * mean interval and its variance. This has to be at least 3 to still
+ * make sense. Higher values may improve prediction confidence but more
+ * false negatives are to be expected.
+ */
+#define IRQT_INTERVAL_WINDOW	3
+
+
+struct irqt_prediction {
+	struct list_head node;
+	ktime_t	time;			/* expected occurrence time */
+	int	cpu;			/* CPU for which this was queued for */
+};
+
+struct irqt_stat {
+	ktime_t	last_time;		/* previous IRQ occurrence */
+	u64	n_M2;			/* IRQ interval variance (n scaled) */
+	u32	n_mean;			/* IRQ mean interval (n scaled) */
+	u32	intervals[IRQT_INTERVAL_WINDOW];
+					/* window of recent IRQ intervals */
+	unsigned int w_ptr;		/* current window pointer */
+	u32	predictable;		/* # of IRQs that were predictable */
+	u32	unpredictable;		/* # of IRQs that were not */
+	struct irqt_prediction prediction;
+};
+
+static DEFINE_PER_CPU(struct list_head, irqt_predictions);
+static DEFINE_PER_CPU(raw_spinlock_t, irqt_predictions_lock);
+
+void __init irqt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		INIT_LIST_HEAD(&per_cpu(irqt_predictions, cpu));
+		raw_spin_lock_init(&per_cpu(irqt_predictions_lock, cpu));
+	}
+}
+
+/*
+ * Purge past events.
+ * Caller must take care of locking.
+ */
+static void irqt_purge(ktime_t now, struct list_head *head)
+{
+	struct irqt_prediction *entry, *n;
+
+	list_for_each_entry_safe(entry, n, head, node) {
+		if (ktime_after(entry->time, now))
+			break;
+		list_del_init(&entry->node);
+	}
+}
+
+/*
+ * Enqueue the next predicted event for this IRQ on this CPU.
+ * We are in interrupt context with IRQs disabled.
+ */
+static void irqt_enqueue_prediction(ktime_t now, struct irqt_stat *s)
+{
+	int this_cpu = raw_smp_processor_id();
+	int prev_cpu = s->prediction.cpu;
+	struct list_head *head = &per_cpu(irqt_predictions, this_cpu);
+	u32 predicted_interval = s->n_mean / IRQT_INTERVAL_WINDOW;
+	struct irqt_prediction *list_entry, *new_entry;
+	raw_spinlock_t *lock;
+
+	if (unlikely(prev_cpu != this_cpu && prev_cpu != -1)) {
+		lock = &per_cpu(irqt_predictions_lock, prev_cpu);
+		raw_spin_lock(lock);
+		list_del_init(&s->prediction.node);
+		raw_spin_unlock(lock);
+	}
+
+	lock = &per_cpu(irqt_predictions_lock, this_cpu);
+	raw_spin_lock(lock);
+	irqt_purge(now, head);
+	__list_del_entry(&s->prediction.node);
+	new_entry = &s->prediction;
+	new_entry->time = ktime_add_us(now, predicted_interval);
+	new_entry->cpu = this_cpu;
+	list_for_each_entry(list_entry, head, node)
+		if (ktime_after(new_entry->time, list_entry->time))
+			break;
+	list_add_tail(&new_entry->node, &list_entry->node);
+	raw_spin_unlock(lock);
+}
+
+/**
+ * irqt_get_next_prediction - get relative time before next predicted IRQ
+ *
+ * @cpu: the CPU number for which a prediction is wanted
+ *
+ * This returns the relative time in microsecs before the next expected IRQ
+ * on given CPU, or zero if no prediction is available. Those predictions
+ * are not guaranteed to be reliable, and guaranteed to fail from time to
+ * time i.e. when the predicted IRQ simply never comes, etc.
+ */
+s64 irqt_get_next_prediction(int cpu)
+{
+	raw_spinlock_t *lock = &per_cpu(irqt_predictions_lock, cpu);
+	struct list_head *head = &per_cpu(irqt_predictions, cpu);
+	unsigned long flags;
+	ktime_t now;
+	struct irqt_prediction *next;
+	s64 result;
+
+	raw_spin_lock_irqsave(lock, flags);
+	now = ktime_get();
+	irqt_purge(now, head);
+	next = list_first_entry_or_null(head, struct irqt_prediction, node);
+	result = next ? ktime_us_delta(next->time, now) : 0;
+	raw_spin_unlock_irqrestore(lock, flags);
+	return result;
+}
+
+/*
+ * irqt_process - update timing interval statistics for the given IRQ
+ *
+ * @irq: the IRQ number
+ * @stat: the corresponding IRQ timing stats record
+ *
+ * This is assumed to be called in IRQ context with desc->lock held and
+ * IRQs turned off.
+ */
+void irqt_process(unsigned int irq, struct irqt_stat *s)
+{
+	ktime_t now = ktime_get();
+	ktime_t ktime_interval = ktime_sub(now, s->last_time);
+	u32 oldX, newX, n = IRQT_INTERVAL_WINDOW;
+	s32 delta, n_dold, n_dnew;
+
+	s->last_time = now;
+
+	/* An interval needs at least two events */
+	if (unlikely(ktime_equal(now, ktime_interval)))
+		return;
+
+	/*
+	 * There is no point attempting predictions on interrupts more
+	 * than 1 second apart. This has no benefit for sleep state
+	 * selection and increases the risk of overflowing our variance
+	 * computation. Reset all stats in that case.
+	 */
+	if (unlikely(ktime_after(ktime_interval, ktime_set(1, 0)))) {
+		s->n_mean = 0;
+		return;
+	}
+
+	/* microsecs is good enough */
+	newX = ktime_to_us(ktime_interval);
+
+	/* Seed the stats with the first interval */
+	if (unlikely(!s->n_mean)) {
+		int i;
+
+		s->n_M2 = 0;
+		s->n_mean = newX * n;
+		for (i = 0; i < IRQT_INTERVAL_WINDOW; i++)
+			s->intervals[i] = newX;
+		s->predictable = s->unpredictable = 0;
+		return;
+	}
+
+	/* Replace the oldest interval in our window */
+	oldX = s->intervals[s->w_ptr];
+	s->intervals[s->w_ptr] = newX;
+	s->w_ptr = (s->w_ptr + 1) % IRQT_INTERVAL_WINDOW;
+
+	/*
+	 * The variance gives us an instantaneous deviation from the
+	 * mean interval value. Given x a new inter-IRQ interval and n the
+	 * number of such intervals to date:
+	 *
+	 *	n = n + 1
+	 *	delta = x - mean
+	 *	mean = mean + delta/n
+	 *	M2 = M2 + delta*(x - mean)
+	 *
+	 *	variance = M2/(n - 1)
+	 *
+	 * We want to update the variance over a window of recent intervals
+	 * in order to stay current with changing IRQ patterns. To remove
+	 * the contribution from a sample x:
+	 *
+	 *	n = n - 1
+	 *	delta = x - mean
+	 *	mean = mean - delta/n
+	 *	M2 = M2 - delta*(x - mean)
+	 *
+	 * Combining those equations, we update both the mean and
+	 * variance by removing the contribution from the oldest window
+	 * sample and adding the latest one at the same time:
+	 *
+	 *	delta = newX - oldX
+	 *	dold = oldX - mean
+	 *	mean = mean + delta/n
+	 *	dnew = newX - mean
+	 *	M2 = M2 + delta * (dold + dnew)
+	 *
+	 * Ref:
+	 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+	 *
+	 * However this is unstable if performed with integer math due to
+	 * the accumulation of bit truncation errors caused by the division.
+	 * To avoid that, let's factor out the division. Assuming
+	 * n_mean = n * mean:
+	 *
+	 *	delta = newX - oldX
+	 *	n_dold = n * oldX - n_mean
+	 *	n_mean = n_mean + delta
+	 *	n_dnew = n * newX - n_mean
+	 *	n_M2 = n_M2 + delta * (n_dold + n_dnew)
+	 *
+	 *	variance = n_M2/n / (n - 1)
+	 *
+	 * To make things as efficient as possible, we keep our window
+	 * size constant: n = IRQT_INTERVAL_WINDOW.
+	 */
+	delta = newX - oldX;
+	n_dold = n*oldX - s->n_mean;
+	s->n_mean += delta;
+	n_dnew = n*newX - s->n_mean;
+	s->n_M2 += (s64)delta * (n_dold + n_dnew);
+
+	/*
+	 * Let's determine if this interrupt actually happened after a
+	 * periodic interval. We treat a standard deviation greater than
+	 * the mean value as a signal that the current interval is no longer
+	 * stable enough to be predictable.
+	 *
+	 *	mean < SD  -->  mean < sqrt(variance)  -->  mean^2 < variance
+	 *
+	 *	n_mean/n * n_mean/n < n_M2/n / (n - 1)  -->
+	 *	n_mean * n_mean * (n - 1) < n_M2 * n
+	 */
+	if ((u64)s->n_mean * s->n_mean * (n - 1) > s->n_M2 * n) {
+		s->predictable++;
+		if (s->predictable >= IRQT_INTERVAL_WINDOW)
+			irqt_enqueue_prediction(now, s);
+	} else {
+		s->predictable = 0;
+		s->unpredictable++;
+	}
+
+	trace_irq_timings(irq, newX, div_u64(s->n_M2, n*(n-1)), s->n_mean/n,
+			  s->predictable, s->unpredictable);
+}
+
+/*
+ * Called from __setup_irq() after successful registration of a new action
+ * handler.
+ */
+int irqt_register(struct irq_desc *desc)
+{
+	struct irqt_stat *s;
+	unsigned long flags;
+	int ret;
+
+	if (desc->irq_timings)
+		return 0;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&s->prediction.node);
+	s->prediction.cpu = -1;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	if (desc->irq_timings) {
+		/* someone else raced ahead of us */
+		ret = 0;
+	} else if (!desc->action) {
+		/* unused IRQ? */
+		ret = -ENXIO;
+	} else if (irq_settings_is_per_cpu(desc)) {
+		/* we're not set for per-CPU accounting */
+		pr_warn("IRQ %d: can't do timing stats on per-CPU IRQs\n",
+			desc->action->irq);
+		ret = -ENOSYS;
+	} else {
+		desc->irq_timings = s;
+		s = NULL;
+		ret = 0;
+	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	if (s)
+		kfree(s);
+	return ret;
+}
+
+/*
+ * Called from __free_irq() when there is no longer any handler attached
+ * to the IRQ descriptor. Must be called with desc->lock held.
+ */
+void irqt_unregister(struct irq_desc *desc)
+{
+	struct irqt_stat *s;
+	int cpu;
+	raw_spinlock_t *lock;
+
+	assert_raw_spin_locked(&desc->lock);
+	if (!desc->irq_timings)
+		return;
+	s = desc->irq_timings;
+	desc->irq_timings = NULL;
+	cpu = s->prediction.cpu;
+	if (cpu != -1) {
+		lock = &per_cpu(irqt_predictions_lock, cpu);
+		raw_spin_lock(lock);
+		__list_del_entry(&s->prediction.node);
+		raw_spin_unlock(lock);
+	}
+	kfree(s);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b4976ab65b39..ce4b31bc7f28 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4968,21 +4968,45 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		if (idle_cpu(i)) {
			struct rq *rq = cpu_rq(i);
			struct cpuidle_state *idle = idle_get_state(rq);
-			if (idle && idle->exit_latency < min_exit_latency) {
+
+			if (idle) {
+
				/*
-				 * We give priority to a CPU whose idle state
-				 * has the smallest exit latency irrespective
-				 * of any idle timestamp.
+				 * When we want to save energy, exclude cpu which did not reach
+				 * the break even point in the idle state
				 */
-				min_exit_latency = idle->exit_latency;
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
-				   rq->idle_stamp > latest_idle_timestamp) {
+				if (sched_feat(ENERGY_IDLE) &&
+				    ((ktime_to_us(ktime_get()) - idle->idle_stamp <
+				      idle->target_residency)))
+					continue;
+
+				if (idle->exit_latency < min_exit_latency) {
+					/*
+					 * We give priority to a CPU
+					 * whose idle state has the
+					 * smallest exit latency
+					 * irrespective of any idle
+					 * timestamp.
+					 */
+					min_exit_latency = idle->exit_latency;
+					latest_idle_timestamp = idle->idle_stamp;
+					shallowest_idle_cpu = i;
+				} else if (idle->exit_latency == min_exit_latency &&
+					   idle->idle_stamp > latest_idle_timestamp) {
+					/*
+					 * If the CPU is in the same
+					 * idle state, choose the more
+					 * recent one as it might have
+					 * a warmer cache
+					 */
+					latest_idle_timestamp = idle->idle_stamp;
+					shallowest_idle_cpu = i;
+				}
+			} else if (rq->idle_stamp > latest_idle_timestamp) {
				/*
-				 * If equal or no active idle state, then
-				 * the most recently idled CPU might have
-				 * a warmer cache.
+				 * If no active idle state, then the
+				 * most recent idled CPU might have a
+				 * warmer cache
				 */
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
@@ -4996,7 +5020,15 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	/*
+	 * If there is a non idle cpu different from the current one,
+	 * let's use it if we want to save energy by not waking up an
+	 * idle cpu, otherwise let's use the shallowest idle cpu
+	 */
+	if (sched_feat(ENERGY_IDLE) && least_loaded_cpu != this_cpu)
+		return least_loaded_cpu;
+	else
+		return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..b14f8ebcf4b1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,6 +61,11 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 
 /*
+ * Apply energy saving agressive policy when idle
+ */
+SCHED_FEAT(ENERGY_IDLE, true)
+
+/*
 * Apply the automatic NUMA scheduling policy. Enabled automatically
 * at runtime if running on a NUMA machine. Can be controlled via
 * numa_balancing=
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..ffef99b2ad03 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 #include <linux/tick.h>
+#include <linux/pm_qos.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
 
@@ -42,18 +43,6 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
 __setup("hlt", cpu_idle_nopoll_setup);
 #endif
 
-static inline int cpu_idle_poll(void)
-{
-	rcu_idle_enter();
-	trace_cpu_idle_rcuidle(0, smp_processor_id());
-	local_irq_enable();
-	while (!tif_need_resched())
-		cpu_relax();
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
-	rcu_idle_exit();
-	return 1;
-}
-
 /* Weak implementations for optional arch specific functions */
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
@@ -65,6 +54,23 @@ void __weak arch_cpu_idle(void)
 	local_irq_enable();
 }
 
+void __weak arch_cpu_idle_poll(void)
+{
+	local_irq_enable();
+	while (!tif_need_resched())
+		cpu_relax();
+}
+
+static inline int cpu_idle_poll(void)
+{
+	rcu_idle_enter();
+	trace_cpu_idle_rcuidle(0, smp_processor_id());
+	arch_cpu_idle_poll();
+	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	rcu_idle_exit();
+	return 1;
+}
+
 /**
 * cpuidle_idle_call - the main idle function
 *
@@ -74,7 +80,7 @@ void __weak arch_cpu_idle(void)
 * set, and it returns with polling set.  If it ever stops polling, it
 * must clear the polling bit.
 */
-static void cpuidle_idle_call(void)
+static void cpuidle_idle_call(unsigned int latency_req, s64 next_timer_event)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
@@ -107,7 +113,7 @@ static void cpuidle_idle_call(void)
	 * Ask the cpuidle framework to choose a convenient idle state.
	 * Fall back to the default arch idle method on errors.
	 */
-	next_state = cpuidle_select(drv, dev);
+	next_state = cpuidle_select(drv, dev, latency_req, next_timer_event);
 	if (next_state < 0) {
 use_default:
 		/*
@@ -166,7 +172,8 @@ use_default:
 	/*
	 * Give the governor an opportunity to reflect on the outcome
	 */
-	cpuidle_reflect(dev, entered_state);
+	if (entered_state >= 0)
+		cpuidle_reflect(dev, entered_state);
 
 exit_idle:
 	__current_set_polling();
@@ -188,6 +195,9 @@ exit_idle:
 */
 static void cpu_idle_loop(void)
 {
+	unsigned int latency_req;
+	s64 next_timer_event;
+
 	while (1) {
 		/*
		 * If the arch has a polling bit, we maintain an invariant:
@@ -211,19 +221,30 @@ static void cpu_idle_loop(void)
			local_irq_disable();
			arch_cpu_idle_enter();
 
+			latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+
+			next_timer_event =
+				ktime_to_us(tick_nohz_get_sleep_length());
+
			/*
			 * In poll mode we reenable interrupts and spin.
			 *
+			 * If the latency req is zero, we don't want to
+			 * enter any idle state and we jump to the poll
+			 * function directly
+			 *
			 * Also if we detected in the wakeup from idle
			 * path that the tick broadcast device expired
			 * for us, we don't want to go deep idle as we
			 * know that the IPI is going to arrive right
			 * away
			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			if (!latency_req || cpu_idle_force_poll ||
+			    tick_check_broadcast_expired())
				cpu_idle_poll();
			else
-				cpuidle_idle_call();
+				cpuidle_idle_call(latency_req,
						  next_timer_event);
 
			arch_cpu_idle_exit();
 		}
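Editor's note: the window arithmetic that kernel/irq/timings.c documents above can be exercised in isolation. The sketch below replays the n-scaled update (delta, n_dold, n_dnew, n_M2) and the predictability test (n_mean^2 * (n - 1) > n_M2 * n) in plain user-space C. The struct and function names plus the sample intervals are illustrative; only the arithmetic and the integer types follow the patch.

#include <stdio.h>
#include <stdint.h>

#define WINDOW 3	/* mirrors IRQT_INTERVAL_WINDOW */

struct irqt_window {
	uint32_t intervals[WINDOW];	/* recent inter-IRQ intervals, us */
	unsigned int w_ptr;		/* oldest slot in the window */
	uint32_t n_mean;		/* n * mean, as in the patch */
	int64_t n_M2;			/* n * M2 (variance numerator) */
};

/* Replace the oldest sample with newX; O(1) mean/variance update. */
static void window_update(struct irqt_window *s, uint32_t newX)
{
	const int32_t n = WINDOW;
	uint32_t oldX = s->intervals[s->w_ptr];
	int32_t delta, n_dold, n_dnew;

	s->intervals[s->w_ptr] = newX;
	s->w_ptr = (s->w_ptr + 1) % WINDOW;

	/* Same u32/s32 wraparound behaviour the kernel code relies on */
	delta = newX - oldX;
	n_dold = n * oldX - s->n_mean;
	s->n_mean += delta;
	n_dnew = n * newX - s->n_mean;
	s->n_M2 += (int64_t)delta * (n_dold + n_dnew);
}

/* Predictable iff mean > stddev, i.e. n_mean^2 * (n-1) > n_M2 * n. */
static int window_predictable(const struct irqt_window *s)
{
	const int64_t n = WINDOW;

	return (int64_t)s->n_mean * s->n_mean * (n - 1) > s->n_M2 * n;
}

int main(void)
{
	struct irqt_window s = {
		.intervals = { 100, 100, 100 },
		.n_mean = 3 * 100,	/* seeded the way irqt_process() seeds */
	};
	uint32_t samples[] = { 101, 99, 100, 2500 };
	unsigned int i;

	/* Steady ~100us intervals stay predictable; the 2500us outlier
	 * blows up the variance and drops the predictable flag. */
	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		window_update(&s, samples[i]);
		printf("interval=%u mean=%u predictable=%d\n",
		       samples[i], s.n_mean / WINDOW, window_predictable(&s));
	}
	return 0;
}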