author		Vincent Guittot <vincent.guittot@linaro.org>	2015-02-02 13:38:27 +0100
committer	Vincent Guittot <vincent.guittot@linaro.org>	2015-02-02 13:38:27 +0100
commit		535883342fd96de91597048a78158faf8e2c59c6 (patch)
tree		b13e25cf53e374416a1efc1b01e185ff8ccf7446
parent		15bc600daf64ec7478988611650bb18c5a03664d (diff)
parent		30ae92e267453cc86953200c0f78db0b0995d631 (diff)
Merge branch 'test-sched-cpuidle-tc2' into test-sched-all-tc2
 drivers/acpi/processor_idle.c      |  10
 drivers/ata/libata-sff.c           |   9
 drivers/ata/pata_mpiix.c           |   2
 drivers/cpuidle/Kconfig            |   4
 drivers/cpuidle/cpuidle.c          |  92
 drivers/cpuidle/driver.c           |  32
 drivers/cpuidle/governors/Makefile |   1
 drivers/cpuidle/governors/irq.c    |  33
 drivers/cpuidle/governors/ladder.c |  32
 drivers/cpuidle/governors/menu.c   | 205
 drivers/cpuidle/sysfs.c            | 156
 drivers/idle/intel_idle.c          |   2
 include/linux/cpuidle.h            |  35
 include/linux/interrupt.h          |   1
 include/linux/irq.h                |   6
 include/linux/irqdesc.h            |   5
 include/trace/events/irq.h         |  45
 kernel/irq/Kconfig                 |   5
 kernel/irq/Makefile                |   1
 kernel/irq/internals.h             |  18
 kernel/irq/irqdesc.c               |   2
 kernel/irq/manage.c                |   4
 kernel/irq/proc.c                  |  67
 kernel/irq/timings.c               | 338
 kernel/sched/fair.c                |  58
 kernel/sched/features.h            |   5
 kernel/sched/idle.c                |  55
27 files changed, 989 insertions(+), 234 deletions(-)
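Editor's note: taken together, the merged branches make the idle-state decision explicit. kernel/sched/idle.c samples the PM QoS latency bound and the next timer expiry, the new irq governor (drivers/cpuidle/governors/irq.c) shrinks that sleep horizon with the per-CPU IRQ prediction from kernel/irq/timings.c, and cpuidle_find_state() picks the deepest state whose target residency and exit latency both fit. Below is a minimal stand-alone C sketch of that selection walk; the state table and the sample numbers are illustrative, only the walk mirrors cpuidle_find_state() and the irq governor's select() in the diff that follows.

#include <stdio.h>
#include <stdint.h>

struct state {
	const char *name;
	unsigned int exit_latency;	/* worst-case wakeup cost, us */
	unsigned int target_residency;	/* break-even sleep time, us */
};

/* Same walk as cpuidle_find_state(): the deepest state that fits wins. */
static int find_state(const struct state *states, int count,
		      unsigned int sleep_time, unsigned int latency_req)
{
	int i, ret = -1;

	for (i = 0; i < count; i++) {
		if (states[i].target_residency > sleep_time)
			continue;
		if (states[i].exit_latency > latency_req)
			continue;
		ret = i;	/* states are ordered shallow to deep */
	}
	return ret;
}

static int64_t min64(int64_t a, int64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	const struct state states[] = {
		{ "WFI", 1,   1    },
		{ "C2",  50,  200  },
		{ "C3",  300, 2000 },
	};
	unsigned int latency_req = 400;	 /* PM_QOS_CPU_DMA_LATENCY bound */
	int64_t next_timer_event = 5000; /* tick_nohz_get_sleep_length(), us */
	int64_t next_irq_event = 150;	 /* irqt_get_next_prediction(), 0 = none */

	/* What the irq governor's select() does with the two horizons */
	int64_t next_event = next_irq_event ?
		min64(next_irq_event, next_timer_event) : next_timer_event;

	int timer_only = find_state(states, 3,
				    (unsigned int)next_timer_event, latency_req);
	int with_irq = find_state(states, 3,
				  (unsigned int)next_event, latency_req);

	/* With these numbers: C3 on the timer alone, WFI once the
	 * predicted IRQ caps the horizon at 150us. */
	printf("timer horizon only:  %s\n", states[timer_only].name);
	printf("with IRQ prediction: %s\n", states[with_irq].name);
	return 0;
}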
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 87b704e41877..5d0bcd21a822 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -782,7 +782,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 	if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
 	    !pr->flags.has_cst &&
 	    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
-		return acpi_idle_enter_c1(dev, drv, CPUIDLE_DRIVER_STATE_START);
+		return acpi_idle_enter_c1(dev, drv, 0);
 #endif
 
 	/*
@@ -830,7 +830,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 	if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
 	    !pr->flags.has_cst &&
 	    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
-		return acpi_idle_enter_c1(dev, drv, CPUIDLE_DRIVER_STATE_START);
+		return acpi_idle_enter_c1(dev, drv, 0);
 #endif
 
 	if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
@@ -905,7 +905,7 @@ struct cpuidle_driver acpi_idle_driver = {
 static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
 					   struct cpuidle_device *dev)
 {
-	int i, count = CPUIDLE_DRIVER_STATE_START;
+	int i, count = 0;
 	struct acpi_processor_cx *cx;
 
 	if (!pr->flags.power_setup_done)
@@ -950,7 +950,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
  */
 static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
 {
-	int i, count = CPUIDLE_DRIVER_STATE_START;
+	int i, count = 0;
 	struct acpi_processor_cx *cx;
 	struct cpuidle_state *state;
 	struct cpuidle_driver *drv = &acpi_idle_driver;
@@ -985,6 +985,8 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
 		state->flags = 0;
 		switch (cx->type) {
 		case ACPI_STATE_C1:
+			if (cx->entry_method != ACPI_CSTATE_FFH)
+				state->flags |= CPUIDLE_FLAG_TIME_INVALID;
 
 			state->enter = acpi_idle_enter_c1;
 			state->enter_dead = acpi_idle_play_dead;
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index db90aa35cb71..062297326f07 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2440,7 +2440,8 @@ int ata_pci_sff_activate_host(struct ata_host *host,
 		int i;
 
 		rc = devm_request_irq(dev, pdev->irq, irq_handler,
-				      IRQF_SHARED, drv_name, host);
+				      IRQF_SHARED | IRQF_TIMINGS,
+				      drv_name, host);
 		if (rc)
 			goto out;
 
@@ -2452,7 +2453,8 @@
 	} else if (legacy_mode) {
 		if (!ata_port_is_dummy(host->ports[0])) {
 			rc = devm_request_irq(dev, ATA_PRIMARY_IRQ(pdev),
-					      irq_handler, IRQF_SHARED,
+					      irq_handler,
+					      IRQF_SHARED | IRQF_TIMINGS,
 					      drv_name, host);
 			if (rc)
 				goto out;
@@ -2463,7 +2465,8 @@
 
 		if (!ata_port_is_dummy(host->ports[1])) {
 			rc = devm_request_irq(dev, ATA_SECONDARY_IRQ(pdev),
-					      irq_handler, IRQF_SHARED,
+					      irq_handler,
+					      IRQF_SHARED | IRQF_TIMINGS,
 					      drv_name, host);
 			if (rc)
 				goto out;
diff --git a/drivers/ata/pata_mpiix.c b/drivers/ata/pata_mpiix.c
index 202b4d601393..b7663242cd60 100644
--- a/drivers/ata/pata_mpiix.c
+++ b/drivers/ata/pata_mpiix.c
@@ -208,7 +208,7 @@ static int mpiix_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 	ata_sff_std_ports(&ap->ioaddr);
 
 	/* activate host */
-	return ata_host_activate(host, irq, ata_sff_interrupt, IRQF_SHARED,
+	return ata_host_activate(host, irq, ata_sff_interrupt, IRQF_SHARED | IRQF_TIMINGS,
				 &mpiix_sht);
 }
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c5029c1209b4..1aae78bc9d95 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -25,6 +25,10 @@ config CPU_IDLE_GOV_MENU
 	bool "Menu governor (for tickless system)"
 	default y
 
+config CPU_IDLE_GOV_IRQ
+	bool "Irq governor (for tickless system)"
+	default y
+
 config DT_IDLE_STATES
 	bool
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150dc6e81..1924f4e0a3e7 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -8,16 +8,12 @@
  * This code is licenced under the GPL.
  */
 
-#include <linux/clockchips.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
 #include <linux/notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
 #include <linux/module.h>
 
 #include <trace/events/power.h>
@@ -58,7 +54,7 @@ int cpuidle_play_dead(void)
 		return -ENODEV;
 
 	/* Find lowest-power state that supports long-term idle */
-	for (i = drv->state_count - 1; i >= CPUIDLE_DRIVER_STATE_START; i--)
+	for (i = drv->state_count - 1; i >= 0; i--)
 		if (drv->states[i].enter_dead)
 			return drv->states[i].enter_dead(dev, i);
 
@@ -81,24 +77,33 @@ void cpuidle_use_deepest_state(bool enable)
 }
 
 /**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
+ * cpuidle_find_state - Find an idle state given the constraints
+ *
  * @drv: cpuidle driver for a given CPU.
 * @dev: cpuidle device for a given CPU.
+ *
+ * Returns an index of the state fulfilling the time constraint passed as
+ * parameter, -1 otherwise
+ *
 */
-static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+int cpuidle_find_state(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       unsigned int sleep_time, unsigned int latency_req)
 {
-	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = -1;
 
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+	for (i = 0; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable)
+			continue;
+
+		if (s->target_residency > sleep_time)
+			continue;
+
+		if (s->exit_latency > latency_req)
 			continue;
 
-		latency_req = s->exit_latency;
 		ret = i;
 	}
 	return ret;
@@ -116,21 +121,54 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	int entered_state;
 
 	struct cpuidle_state *target_state = &drv->states[index];
-	ktime_t time_start, time_end;
 	s64 diff;
 
 	trace_cpu_idle_rcuidle(index, dev->cpu);
-	time_start = ktime_get();
 
+	/*
+	 * Store the idle start time for this cpu, this information
+	 * will be used by cpuidle to measure how long the cpu has
+	 * been idle and by the scheduler to prevent to wake it up too
+	 * early
+	 */
+	target_state->idle_stamp = ktime_to_us(ktime_get());
+
+	/*
+	 * The enter the low level idle routine. This call will block
+	 * until an interrupt occurs meaning it is the end of the idle
+	 * period
+	 */
 	entered_state = target_state->enter(dev, drv, index);
-	time_end = ktime_get();
 
+	/*
+	 * Measure as soon as possible the duration of the idle
+	 * period. It MUST be done before re-enabling the interrupt in
+	 * order to prevent to add in the idle time measurement the
+	 * interrupt handling duration
+	 */
+	diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
+
+	/*
+	 * Reset the idle time stamp as the scheduler thinks the cpu is idle
+	 * while it is in the process of waking up
+	 */
+	target_state->idle_stamp = 0;
 
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
+	/*
+	 * The cpuidle_enter_coupled uses the cpuidle_enter function.
+	 * Don't re-enable the interrupts and let the enter_coupled
+	 * function to wait for all cpus to sync and to enable the
+	 * interrupts again from there
+	 */
 	if (!cpuidle_state_is_coupled(dev, drv, entered_state))
 		local_irq_enable();
 
-	diff = ktime_to_us(ktime_sub(time_end, time_start));
+	/*
+	 * The idle duration will be casted to an integer, prevent to
+	 * overflow by setting a boundary to INT_MAX
+	 */
 	if (diff > INT_MAX)
 		diff = INT_MAX;
@@ -143,6 +181,16 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 		 */
 		dev->states_usage[entered_state].time += dev->last_residency;
 		dev->states_usage[entered_state].usage++;
+
+		if (diff < drv->states[entered_state].target_residency) {
+			atomic_inc(&dev->over_estimate);
+		} else if (entered_state < (drv->state_count - 1) &&
+			   diff >=
+			   drv->states[entered_state + 1].target_residency) {
+			atomic_inc(&dev->under_estimate);
+		} else {
+			atomic_inc(&dev->right_estimate);
+		}
 	} else {
 		dev->last_residency = 0;
 	}
@@ -155,10 +203,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
  *
  * @drv: the cpuidle driver
  * @dev: the cpuidle device
+ * @latency_req: the latency constraint when choosing an idle state
+ * @next_timer_event: the duration until the timer expires
 *
 * Returns the index of the idle state.
 */
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		   int latency_req, s64 next_timer_event)
 {
 	if (off || !initialized)
 		return -ENODEV;
@@ -167,9 +218,10 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		return -EBUSY;
 
 	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
+		return cpuidle_find_state(drv, dev, UINT_MAX, UINT_MAX);
 
-	return cpuidle_curr_governor->select(drv, dev);
+	return cpuidle_curr_governor->select(drv, dev, latency_req,
					     next_timer_event);
 }
 
 /**
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 2697e87d5b34..16d67aa1c801 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -178,36 +178,6 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 	}
 }
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev,
-		struct cpuidle_driver *drv, int index)
-{
-	local_irq_enable();
-	if (!current_set_polling_and_test()) {
-		while (!need_resched())
-			cpu_relax();
-	}
-	current_clr_polling();
-
-	return index;
-}
-
-static void poll_idle_init(struct cpuidle_driver *drv)
-{
-	struct cpuidle_state *state = &drv->states[0];
-
-	snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
-	snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
-	state->exit_latency = 0;
-	state->target_residency = 0;
-	state->power_usage = -1;
-	state->enter = poll_idle;
-	state->disabled = false;
-}
-#else
-static void poll_idle_init(struct cpuidle_driver *drv) {}
-#endif /* !CONFIG_ARCH_HAS_CPU_RELAX */
-
 /**
 * __cpuidle_register_driver: register the driver
 * @drv: a valid pointer to a struct cpuidle_driver
@@ -241,8 +211,6 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
 		on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1);
 
-	poll_idle_init(drv);
-
 	return 0;
 }
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
index 1b512722689f..8804ee2f550c 100644
--- a/drivers/cpuidle/governors/Makefile
+++ b/drivers/cpuidle/governors/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
 obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_IRQ) += irq.o
diff --git a/drivers/cpuidle/governors/irq.c b/drivers/cpuidle/governors/irq.c
new file mode 100644
index 000000000000..de99f4545255
--- /dev/null
+++ b/drivers/cpuidle/governors/irq.c
@@ -0,0 +1,33 @@
+/*
+ * irq.c - the irq governor
+ *
+ * Copyright (C) 2014 Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+*/
+#include <linux/ktime.h>
+#include <linux/irq.h>
+#include <linux/cpuidle.h>
+
+static int select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		  int latency_req, s64 next_timer_event)
+{
+	s64 next_irq_event = irqt_get_next_prediction(dev->cpu);
+	s64 next_event = next_irq_event ?
+		min(next_irq_event, next_timer_event) : next_timer_event;
+
+	return cpuidle_find_state(drv, dev, next_event, latency_req);
+}
+
+static struct cpuidle_governor irq_governor = {
+	.name = "irq",
+	.rating = 30,
+	.select = select,
+	.owner = THIS_MODULE,
+};
+
+static int __init irq_init(void)
+{
+	return cpuidle_register_governor(&irq_governor);
+}
+
+postcore_initcall(irq_init);
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 401c0106ed34..5a76dff631b9 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -64,22 +64,21 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
 * @dev: the CPU
 */
 static int ladder_select_state(struct cpuidle_driver *drv,
-			       struct cpuidle_device *dev)
+			       struct cpuidle_device *dev,
+			       int latency_req, s64 next_timer_event)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
 	struct ladder_device_state *last_state;
 	int last_residency, last_idx = ldev->last_state_idx;
-	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-
-	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0)) {
-		ladder_do_selection(ldev, last_idx, 0);
-		return 0;
-	}
 
 	last_state = &ldev->states[last_idx];
 
-	last_residency = cpuidle_get_last_residency(dev) - drv->states[last_idx].exit_latency;
+	if (!(drv->states[last_idx].flags & CPUIDLE_FLAG_TIME_INVALID)) {
+		last_residency = cpuidle_get_last_residency(dev) - \
			drv->states[last_idx].exit_latency;
+	}
+	else
+		last_residency = last_state->threshold.promotion_time + 1;
 
 	/* consider promotion */
 	if (last_idx < drv->state_count - 1 &&
@@ -96,13 +95,13 @@ static int ladder_select_state(struct cpuidle_driver *drv,
 	}
 
 	/* consider demotion */
-	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
+	if (last_idx > 0 &&
 	    (drv->states[last_idx].disabled ||
 	    dev->states_usage[last_idx].disable ||
 	    drv->states[last_idx].exit_latency > latency_req)) {
 		int i;
 
-		for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
+		for (i = last_idx - 1; i > 0; i--) {
 			if (drv->states[i].exit_latency <= latency_req)
				break;
 		}
@@ -110,7 +109,7 @@ static int ladder_select_state(struct cpuidle_driver *drv,
 		return i;
 	}
 
-	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
+	if (last_idx > 0 &&
 	    last_residency < last_state->threshold.demotion_time) {
 		last_state->stats.demotion_count++;
 		last_state->stats.promotion_count = 0;
@@ -137,9 +136,9 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 	struct ladder_device_state *lstate;
 	struct cpuidle_state *state;
 
-	ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START;
+	ldev->last_state_idx = 0;
 
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+	for (i = 0; i < drv->state_count; i++) {
 		state = &drv->states[i];
 		lstate = &ldev->states[i];
@@ -151,7 +150,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 
 		if (i < drv->state_count - 1)
			lstate->threshold.promotion_time = state->exit_latency;
-		if (i > CPUIDLE_DRIVER_STATE_START)
+		if (i > 0)
			lstate->threshold.demotion_time = state->exit_latency;
 	}
 
@@ -166,8 +165,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv,
 static void ladder_reflect(struct cpuidle_device *dev, int index)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
-	if (index > 0)
-		ldev->last_state_idx = index;
+	ldev->last_state_idx = index;
 }
 
 static struct cpuidle_governor ladder_governor = {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 40580794e23d..ac2be02de5c6 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -13,10 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
 #include <linux/pm_qos.h>
-#include <linux/time.h>
-#include <linux/ktime.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
 #include <linux/module.h>
@@ -188,7 +184,6 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
-static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
 
 /* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
 static u64 div_round64(u64 dividend, u32 divisor)
@@ -196,13 +191,87 @@ static u64 div_round64(u64 dividend, u32 divisor)
 	return div_u64(dividend + (divisor / 2), divisor);
 }
 
+/**
+ * menu_update - attempts to guess what happened after entry
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ */
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	struct menu_device *data = this_cpu_ptr(&menu_devices);
+	int last_idx = data->last_state_idx;
+	struct cpuidle_state *target = &drv->states[last_idx];
+	unsigned int measured_us;
+	unsigned int new_factor;
+
+	/*
+	 * Try to figure out how much time passed between entry to low
+	 * power state and occurrence of the wakeup event.
+	 *
+	 * If the entered idle state didn't support residency measurements,
+	 * we are basically lost in the dark how much time passed.
+	 * As a compromise, assume we slept for the whole expected time.
+	 *
+	 * Any measured amount of time will include the exit latency.
+	 * Since we are interested in when the wakeup begun, not when it
+	 * was completed, we must subtract the exit latency. However, if
+	 * the measured amount of time is less than the exit latency,
+	 * assume the state was never reached and the exit latency is 0.
+	 */
+	if (unlikely(target->flags & CPUIDLE_FLAG_TIME_INVALID)) {
+		/* Use timer value as is */
+		measured_us = data->next_timer_us;
+
+	} else {
+		/* Use measured value */
+		measured_us = cpuidle_get_last_residency(dev);
+
+		/* Deduct exit latency */
+		if (measured_us > target->exit_latency)
+			measured_us -= target->exit_latency;
+
+		/* Make sure our coefficients do not exceed unity */
+		if (measured_us > data->next_timer_us)
+			measured_us = data->next_timer_us;
+	}
+
+	/* Update our correction ratio */
+	new_factor = data->correction_factor[data->bucket];
+	new_factor -= new_factor / DECAY;
+
+	if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
+		new_factor += RESOLUTION * measured_us / data->next_timer_us;
+	else
+		/*
+		 * we were idle so long that we count it as a perfect
+		 * prediction
+		 */
+		new_factor += RESOLUTION;
+
+	/*
+	 * We don't want 0 as factor; we always want at least
+	 * a tiny bit of estimated time. Fortunately, due to rounding,
+	 * new_factor will stay nonzero regardless of measured_us values
+	 * and the compiler can eliminate this test as long as DECAY > 1.
+	 */
+	if (DECAY == 1 && unlikely(new_factor == 0))
+		new_factor = 1;
+
+	data->correction_factor[data->bucket] = new_factor;
+
+	/* update the repeating-pattern data */
+	data->intervals[data->interval_ptr++] = measured_us;
+	if (data->interval_ptr >= INTERVALS)
+		data->interval_ptr = 0;
+}
+
 /*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
-static void get_typical_interval(struct menu_device *data)
+static unsigned int get_typical_interval(struct menu_device *data)
 {
 	int i, divisor;
 	unsigned int max, thresh;
@@ -259,11 +328,8 @@ again:
 	if (likely(stddev <= ULONG_MAX)) {
 		stddev = int_sqrt(stddev);
 		if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
-							|| stddev <= 20) {
-			if (data->next_timer_us > avg)
-				data->predicted_us = avg;
-			return;
-		}
+							|| stddev <= 20)
+			return avg;
 	}
 
 	/*
@@ -276,7 +342,7 @@ again:
	 * with sporadic activity with a bunch of short pauses.
	 */
 	if ((divisor * 4) <= INTERVALS * 3)
-		return;
+		return 0;
 
 	thresh = max - 1;
 	goto again;
@@ -287,12 +353,12 @@ again:
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       int latency_req, s64 next_timer_event)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
-	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-	int i;
 	unsigned int interactivity_req;
+	unsigned int interactivity_overrride_us;
 	unsigned long nr_iowaiters, cpu_load;
 
 	if (data->needs_update) {
@@ -300,14 +366,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		data->needs_update = 0;
 	}
 
-	data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
-
-	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0))
-		return 0;
+	data->last_state_idx = 0;
 
 	/* determine the expected residency time, round up */
-	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+	data->next_timer_us = next_timer_event;
 
 	get_iowait_load(&nr_iowaiters, &cpu_load);
 	data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -321,7 +383,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
					 data->correction_factor[data->bucket],
					 RESOLUTION * DECAY);
 
-	get_typical_interval(data);
+	interactivity_overrride_us = get_typical_interval(data);
+	if (interactivity_overrride_us &&
+	    data->next_timer_us > interactivity_overrride_us)
+		data->predicted_us = interactivity_overrride_us;
 
 	/*
	 * Performance multiplier defines a minimum predicted idle
@@ -333,31 +398,11 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		latency_req = interactivity_req;
 
 	/*
-	 * We want to default to C1 (hlt), not to busy polling
-	 * unless the timer is happening really really soon.
-	 */
-	if (data->next_timer_us > 5 &&
-	    !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
-	    dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
-		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
-
-	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
-	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
-		struct cpuidle_state *s = &drv->states[i];
-		struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-		if (s->disabled || su->disable)
-			continue;
-		if (s->target_residency > data->predicted_us)
-			continue;
-		if (s->exit_latency > latency_req)
-			continue;
-
-		data->last_state_idx = i;
-	}
+	data->last_state_idx = cpuidle_find_state(drv, dev, data->predicted_us,
						  latency_req);
 
 	return data->last_state_idx;
 }
@@ -374,77 +419,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	data->last_state_idx = index;
-	if (index >= 0)
-		data->needs_update = 1;
-}
-
-/**
- * menu_update - attempts to guess what happened after entry
- * @drv: cpuidle driver containing state data
- * @dev: the CPU
- */
-static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
-{
-	struct menu_device *data = this_cpu_ptr(&menu_devices);
-	int last_idx = data->last_state_idx;
-	struct cpuidle_state *target = &drv->states[last_idx];
-	unsigned int measured_us;
-	unsigned int new_factor;
-
-	/*
-	 * Try to figure out how much time passed between entry to low
-	 * power state and occurrence of the wakeup event.
-	 *
-	 * If the entered idle state didn't support residency measurements,
-	 * we use them anyway if they are short, and if long,
-	 * truncate to the whole expected time.
-	 *
-	 * Any measured amount of time will include the exit latency.
-	 * Since we are interested in when the wakeup begun, not when it
-	 * was completed, we must subtract the exit latency. However, if
-	 * the measured amount of time is less than the exit latency,
-	 * assume the state was never reached and the exit latency is 0.
-	 */
-
-	/* measured value */
-	measured_us = cpuidle_get_last_residency(dev);
-
-	/* Deduct exit latency */
-	if (measured_us > target->exit_latency)
-		measured_us -= target->exit_latency;
-
-	/* Make sure our coefficients do not exceed unity */
-	if (measured_us > data->next_timer_us)
-		measured_us = data->next_timer_us;
-
-	/* Update our correction ratio */
-	new_factor = data->correction_factor[data->bucket];
-	new_factor -= new_factor / DECAY;
-
-	if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
-		new_factor += RESOLUTION * measured_us / data->next_timer_us;
-	else
-		/*
-		 * we were idle so long that we count it as a perfect
-		 * prediction
-		 */
-		new_factor += RESOLUTION;
-
-	/*
-	 * We don't want 0 as factor; we always want at least
-	 * a tiny bit of estimated time. Fortunately, due to rounding,
-	 * new_factor will stay nonzero regardless of measured_us values
-	 * and the compiler can eliminate this test as long as DECAY > 1.
-	 */
-	if (DECAY == 1 && unlikely(new_factor == 0))
-		new_factor = 1;
-
-	data->correction_factor[data->bucket] = new_factor;
-
-	/* update the repeating-pattern data */
-	data->intervals[data->interval_ptr++] = measured_us;
-	if (data->interval_ptr >= INTERVALS)
-		data->interval_ptr = 0;
+	data->needs_update = 1;
 }
 
 /**
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 97c5903b4606..f446bd0fd9bd 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -439,6 +439,154 @@ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 		cpuidle_free_state_kobj(device, i);
 }
 
+#define kobj_to_stats_kobj(k) container_of(k, struct cpuidle_stats_kobj, kobj)
+#define attr_to_stats_attr(a) container_of(a, struct cpuidle_stats_attr, attr)
+
+#define define_show_stats_function(_name)				\
+	static ssize_t show_stats_##_name(struct cpuidle_device *dev,	\
+					  char *buf)			\
+	{								\
+		return sprintf(buf, "%d\n", atomic_read(&dev->_name));	\
+	}
+
+#define define_store_stats_function(_name)				\
+	static ssize_t store_stats_##_name(struct cpuidle_device *dev,	\
+					   const char *buf, size_t size) \
+	{								\
+		unsigned long long value;				\
+		int err;						\
+		if (!capable(CAP_SYS_ADMIN))				\
+			return -EPERM;					\
+		err = kstrtoull(buf, 0, &value);			\
+		if (err)						\
+			return err;					\
+									\
+		atomic_set(&dev->_name, value);				\
+		return size;						\
+	}
+
+#define define_one_stats_rw(_name, show, store)				\
+	static struct cpuidle_stats_attr attr_stats_##_name =		\
+		__ATTR(_name, 0644, show, store)
+
+struct cpuidle_stats_kobj {
+	struct cpuidle_device *dev;
+	struct completion kobj_unregister;
+	struct kobject kobj;
+};
+
+struct cpuidle_stats_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cpuidle_device *, char *);
+	ssize_t (*store)(struct cpuidle_device *, const char *, size_t);
+};
+
+static void cpuidle_stats_sysfs_release(struct kobject *kobj)
+{
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	complete(&stats_kobj->kobj_unregister);
+}
+
+static ssize_t cpuidle_stats_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	int ret = -EIO;
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+	if (dattr->show)
+		ret = dattr->show(stats_kobj->dev, buf);
+
+	return ret;
+}
+
+static ssize_t cpuidle_stats_store(struct kobject *kobj,
+				   struct attribute *attr,
+				   const char *buf, size_t size)
+{
+	int ret = -EIO;
+	struct cpuidle_stats_kobj *stats_kobj = kobj_to_stats_kobj(kobj);
+	struct cpuidle_stats_attr *dattr = attr_to_stats_attr(attr);
+
+	if (dattr->store)
+		ret = dattr->store(stats_kobj->dev, buf, size);
+
+	return ret;
+}
+
+define_show_stats_function(right_estimate);
+define_store_stats_function(right_estimate);
+
+define_show_stats_function(under_estimate);
+define_store_stats_function(under_estimate);
+
+define_show_stats_function(over_estimate);
+define_store_stats_function(over_estimate);
+
+define_one_stats_rw(right_estimate,
+		    show_stats_right_estimate,
+		    store_stats_right_estimate);
+
+define_one_stats_rw(under_estimate,
+		    show_stats_under_estimate,
+		    store_stats_under_estimate);
+
+define_one_stats_rw(over_estimate,
+		    show_stats_over_estimate,
+		    store_stats_over_estimate);
+
+static const struct sysfs_ops cpuidle_stats_sysfs_ops = {
+	.show = cpuidle_stats_show,
+	.store = cpuidle_stats_store,
+};
+
+static struct attribute *cpuidle_stats_default_attrs[] = {
+	&attr_stats_right_estimate.attr,
+	&attr_stats_under_estimate.attr,
+	&attr_stats_over_estimate.attr,
+	NULL
+};
+
+static struct kobj_type ktype_stats_cpuidle = {
+	.sysfs_ops = &cpuidle_stats_sysfs_ops,
+	.default_attrs = cpuidle_stats_default_attrs,
+	.release = cpuidle_stats_sysfs_release,
+};
+
+static int cpuidle_add_stats_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_stats_kobj *kstats;
+	struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+	int ret;
+
+	kstats = kzalloc(sizeof(*kstats), GFP_KERNEL);
+	if (!kstats)
+		return -ENOMEM;
+
+	kstats->dev = dev;
+	init_completion(&kstats->kobj_unregister);
+
+	ret = kobject_init_and_add(&kstats->kobj, &ktype_stats_cpuidle,
+				   &kdev->kobj, "stats");
+	if (ret) {
+		kfree(kstats);
+		return ret;
+	}
+
+	kobject_uevent(&kstats->kobj, KOBJ_ADD);
+	dev->kobj_stats = kstats;
+
+	return ret;
+}
+
+static void cpuidle_remove_stats_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_stats_kobj *kstats = dev->kobj_stats;
+
+	kobject_put(&kstats->kobj);
+	wait_for_completion(&kstats->kobj_unregister);
+	kfree(kstats);
+}
+
 #ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
 #define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj)
 #define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr)
@@ -589,6 +737,13 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
 	ret = cpuidle_add_driver_sysfs(device);
 	if (ret)
 		cpuidle_remove_state_sysfs(device);
+
+	ret = cpuidle_add_stats_sysfs(device);
+	if (ret) {
+		cpuidle_remove_driver_sysfs(device);
+		cpuidle_remove_state_sysfs(device);
+	}
+
 	return ret;
 }
 
@@ -598,6 +753,7 @@ int cpuidle_add_device_sysfs(struct cpuidle_device *device)
 */
 void cpuidle_remove_device_sysfs(struct cpuidle_device *device)
 {
+	cpuidle_remove_stats_sysfs(device);
 	cpuidle_remove_driver_sysfs(device);
 	cpuidle_remove_state_sysfs(device);
 }
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 9cceacb92f9d..f735355e0df5 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -845,8 +845,6 @@ static int __init intel_idle_cpuidle_driver_init(void)
 
 	intel_idle_state_table_update();
 
-	drv->state_count = 1;
-
 	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
 		int num_substates, mwait_hint, mwait_cstate;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index ab70f3bc44ad..e1f4914409b3 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,6 +44,7 @@ struct cpuidle_state {
 	int		power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 	bool		disabled; /* disabled on all CPUs */
+	u64		idle_stamp;
 
 	int (*enter)	(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
@@ -53,6 +54,7 @@ struct cpuidle_state {
 };
 
 /* Idle State Flags */
+#define CPUIDLE_FLAG_TIME_INVALID	(0x01) /* is residency time measurable? */
 #define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
 #define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
@@ -61,6 +63,7 @@ struct cpuidle_state {
 struct cpuidle_device_kobj;
 struct cpuidle_state_kobj;
 struct cpuidle_driver_kobj;
+struct cpuidle_stats_kobj;
 
 struct cpuidle_device {
 	unsigned int		registered:1;
@@ -73,8 +76,13 @@ struct cpuidle_device {
 	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
 	struct cpuidle_driver_kobj *kobj_driver;
 	struct cpuidle_device_kobj *kobj_dev;
+	struct cpuidle_stats_kobj *kobj_stats;
 	struct list_head 	device_list;
 
+	atomic_t right_estimate;
+	atomic_t under_estimate;
+	atomic_t over_estimate;
+
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 	int			safe_state_index;
 	cpumask_t		coupled_cpus;
@@ -88,6 +96,8 @@ DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev);
 /**
 * cpuidle_get_last_residency - retrieves the last state's residency time
 * @dev: the target CPU
+ *
+ * NOTE: this value is invalid if CPUIDLE_FLAG_TIME_INVALID is set
 */
 static inline int cpuidle_get_last_residency(struct cpuidle_device *dev)
 {
@@ -118,8 +128,14 @@ struct cpuidle_driver {
 
 #ifdef CONFIG_CPU_IDLE
 extern void disable_cpuidle(void);
+extern int cpuidle_find_state(struct cpuidle_driver *drv,
+			      struct cpuidle_device *dev,
+			      unsigned int sleep_time,
+			      unsigned int latency_req);
+
 extern int cpuidle_select(struct cpuidle_driver *drv,
-			  struct cpuidle_device *dev);
+			  struct cpuidle_device *dev,
+			  int latency_req, s64 next_timer_event);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
			 struct cpuidle_device *dev, int index);
 extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -146,8 +162,14 @@ extern void cpuidle_use_deepest_state(bool enable);
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
 static inline void disable_cpuidle(void) { }
+static inline int cpuidle_find_state(struct cpuidle_driver *drv,
+				     struct cpuidle_device *dev,
+				     unsigned int sleep_time,
+				     unsigned int latency_req)
+{return -ENODEV; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
-				 struct cpuidle_device *dev)
+				 struct cpuidle_device *dev,
+				 int latency_req, s64 next_timer_event)
 {return -ENODEV; }
 static inline int cpuidle_enter(struct cpuidle_driver *drv,
				struct cpuidle_device *dev, int index)
@@ -202,7 +224,8 @@ struct cpuidle_governor {
					struct cpuidle_device *dev);
 
 	int  (*select)		(struct cpuidle_driver *drv,
-					struct cpuidle_device *dev);
+					struct cpuidle_device *dev,
+					int latency_req, s64 next_timer_event);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
 
 	struct module 		*owner;
@@ -215,10 +238,4 @@ static inline int cpuidle_register_governor(struct cpuidle_governor *gov)
 {return 0;}
 #endif
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
-#define CPUIDLE_DRIVER_STATE_START	1
-#else
-#define CPUIDLE_DRIVER_STATE_START	0
-#endif
-
 #endif /* _LINUX_CPUIDLE_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d9b05b5bf8c7..bb7dddc33918 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -59,6 +59,7 @@
 * resume time.
 */
 #define IRQF_DISABLED		0x00000020
+#define IRQF_TIMINGS		0x00000040
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
 #define __IRQF_TIMER		0x00000200
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d09ec7a1243e..8150c1cfc7f4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -862,4 +862,10 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 	return readl(gc->reg_base + reg_offset);
 }
 
+#ifdef CONFIG_IRQ_TIMINGS
+extern s64 irqt_get_next_prediction(int cpu);
+#else
+static inline s64 irqt_get_next_prediction(int cpu) { return 0; }
+#endif
+
 #endif /* _LINUX_IRQ_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index faf433af425e..3d723a946071 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -14,11 +14,13 @@ struct module;
 struct irq_desc;
 struct irq_domain;
 struct pt_regs;
+struct irqt_stat;
 
 /**
 * struct irq_desc - interrupt descriptor
 * @irq_data:		per irq and chip data passed down to chip functions
 * @kstat_irqs:		irq stats per cpu
+ * @irq_timings:	irq occurrence timing statistics
 * @handle_irq:		highlevel irq-events handler
 * @preflow_handler:	handler called before the flow handler (currently used by sparc)
 * @action:		the irq action chain
@@ -49,6 +51,9 @@ struct pt_regs;
 struct irq_desc {
 	struct irq_data		irq_data;
 	unsigned int __percpu	*kstat_irqs;
+#ifdef CONFIG_IRQ_TIMINGS
+	struct irqt_stat	*irq_timings;
+#endif
 	irq_flow_handler_t	handle_irq;
 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
 	irq_preflow_handler_t	preflow_handler;
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 3608bebd3d9c..53910cf3917d 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -84,6 +84,51 @@ TRACE_EVENT(irq_handler_exit,
		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+#ifdef CONFIG_IRQ_TIMINGS
+/**
+ * irq_timings - provide updated IRQ timing statistics
+ * @irq: irq number
+ * @interval: time interval since last irq
+ * @variance: time interval variance
+ * @mean: mean interval
+ * @good: current count of predictable irqs
+ * @bad: current count of unpredictable irqs
+ *
+ * Note: variance is provided/listed before the mean value to help with
+ * alignment constraints on 64-bit values.
+ */
+TRACE_EVENT(irq_timings,
+
+	TP_PROTO(int irq, u32 interval, u64 variance, u32 mean,
+		 u32 good, u32 bad),
+
+	TP_ARGS(irq, interval, variance, mean, good, bad),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq		)
+		__field(	u32,	interval	)
+		__field(	u64,	variance	)
+		__field(	u32,	mean		)
+		__field(	u32,	good		)
+		__field(	u32,	bad		)
+	),
+
+	TP_fast_assign(
+		__entry->irq = irq;
+		__entry->interval = interval;
+		__entry->variance = variance;
+		__entry->mean = mean;
+		__entry->good = good;
+		__entry->bad = bad;
+	),
+
+	TP_printk("irq=%d intv=%u mean=%u variance=%llu (%u vs %u)",
+		  __entry->irq, __entry->interval, __entry->mean,
+		  (unsigned long long)__entry->variance,
+		  __entry->good, __entry->bad)
+);
+#endif
+
 DECLARE_EVENT_CLASS(softirq,
 
	TP_PROTO(unsigned int vec_nr),
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..3a134b685552 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -100,4 +100,9 @@ config SPARSE_IRQ
	  If you don't know what to do here, say N.
 
+# Support for IRQ timing stats and prediction, mainly for cpuidle usage
+config IRQ_TIMINGS
+	bool
+	default y
+
 endmenu
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..fad4b6d2768a 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_IRQ_TIMINGS) += timings.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..a0e7522b2b4f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -110,6 +110,23 @@ static inline void unregister_handler_proc(unsigned int irq,
					   struct irqaction *action) { }
 #endif
 
+#ifdef CONFIG_IRQ_TIMINGS
+extern void __init irqt_init(void);
+extern void irqt_process(unsigned int irq, struct irqt_stat *s);
+static inline void irqt_event(int irq, struct irq_desc *desc)
+{
+	if (desc->irq_timings)
+		irqt_process(irq, desc->irq_timings);
+}
+extern int irqt_register(struct irq_desc *desc);
+extern void irqt_unregister(struct irq_desc *desc);
+#else
+static inline void irqt_init(void) { }
+static inline void irqt_event(int irq, struct irq_desc *desc) { }
+static inline int irqt_register(struct irq_desc *desc) { return 0; }
+static inline void irqt_unregister(struct irq_desc *desc) { }
+#endif
+
 extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
@@ -197,6 +214,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
 {
 	__this_cpu_inc(*desc->kstat_irqs);
 	__this_cpu_inc(kstat.irqs_sum);
+	irqt_event(irq, desc);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23..f31471ebce36 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -232,6 +232,7 @@ int __init early_irq_init(void)
 	int i, initcnt, node = first_online_node;
 	struct irq_desc *desc;
 
+	irqt_init();
 	init_irq_default_affinity();
 
 	/* Let arch update nr_irqs and return the nr of preallocated irqs */
@@ -270,6 +271,7 @@ int __init early_irq_init(void)
 	int count, i, node = first_online_node;
 	struct irq_desc *desc;
 
+	irqt_init();
 	init_irq_default_affinity();
 
 	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..88b487b355ae 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1231,6 +1231,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	register_handler_proc(irq, new);
 	free_cpumask_var(mask);
 
+	if (new->flags & IRQF_TIMINGS)
+		irqt_register(desc);
+
 	return 0;
 
 mismatch:
@@ -1328,6 +1331,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	if (!desc->action) {
 		irq_shutdown(desc);
 		irq_release_resources(desc);
+		irqt_unregister(desc);
 	}
 
 #ifdef CONFIG_SMP
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..4cda809b8512 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -10,8 +10,10 @@
 #include <linux/gfp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/uaccess.h>
 
 #include "internals.h"
 
@@ -283,6 +285,62 @@ static const struct file_operations irq_spurious_proc_fops = {
 	.release = single_release,
 };
 
+static int irq_timings_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long) m->private);
+
+	seq_printf(m, "%d\n", desc->irq_timings ? 1 : 0);
+
+	return 0;
+}
+
+static int irq_timings_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_timings_proc_show, PDE_DATA(inode));
+}
+
+static ssize_t irq_timings_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	long enable;
+	int ret;
+	int irq = (int)(long)PDE_DATA(file_inode(file));
+	struct irq_desc *desc = irq_to_desc(irq);
+	char *buf;
+
+	buf = kzalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(buf, buffer, count))
+		goto out;
+
+	ret = kstrtoul(buf, 0, &enable);
+	if (ret < 0)
+		goto out;
+
+	if (enable) {
+		ret = irqt_register(desc);
+	} else {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		irqt_unregister(desc);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+	}
+out:
+	kfree(buf);
+	return ret ? ret : count;
+}
+
+static const struct file_operations irq_timings_proc_fops = {
+	.open		= irq_timings_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_timings_proc_write,
+};
+
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -358,6 +416,11 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	proc_create_data("spurious", 0444, desc->dir,
			 &irq_spurious_proc_fops, (void *)(long)irq);
 
+#ifdef CONFIG_IRQ_TIMINGS
+	/* create /proc/irq/<irq>/timings */
+	proc_create_data("timings", 0644, desc->dir,
+			 &irq_timings_proc_fops, (void *)(long)irq);
+#endif
 }
 
 void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -373,7 +436,9 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 	remove_proc_entry("node", desc->dir);
 #endif
 	remove_proc_entry("spurious", desc->dir);
-
+#ifdef CONFIG_IRQ_TIMING
+	remove_proc_entry("timing", desc->dir);
+#endif
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%u", irq);
 	remove_proc_entry(name, root_irq_dir);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 000000000000..27994cea4a99
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,338 @@
+/*
+ * IRQ occurrence timing statistics
+ *
+ * Created by:	Nicolas Pitre, November 2014
+ * Copyright:	(C) 2014-2015  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/irq.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "internals.h"
+
+#include <trace/events/irq.h>
+
+
+/*
+ * This is the size of the IRQ interval window used to compute the
+ * mean interval and its variance. This has to be at least 3 to still
+ * make sense. Higher values may improve prediction confidence but more
+ * false negatives are to be expected.
+ */
+#define IRQT_INTERVAL_WINDOW	3
+
+
+struct irqt_prediction {
+	struct list_head node;
+	ktime_t	time;			/* expected occurrence time */
+	int	cpu;			/* CPU for which this was queued for */
+};
+
+struct irqt_stat {
+	ktime_t	last_time;		/* previous IRQ occurrence */
+	u64	n_M2;			/* IRQ interval variance (n scaled) */
+	u32	n_mean;			/* IRQ mean interval (n scaled) */
+	u32	intervals[IRQT_INTERVAL_WINDOW];
+					/* window of recent IRQ intervals */
+	unsigned int w_ptr;		/* current window pointer */
+	u32	predictable;		/* # of IRQs that were predictable */
+	u32	unpredictable;		/* # of IRQs that were not */
+	struct irqt_prediction prediction;
+};
+
+static DEFINE_PER_CPU(struct list_head, irqt_predictions);
+static DEFINE_PER_CPU(raw_spinlock_t, irqt_predictions_lock);
+
+void __init irqt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		INIT_LIST_HEAD(&per_cpu(irqt_predictions, cpu));
+		raw_spin_lock_init(&per_cpu(irqt_predictions_lock, cpu));
+	}
+}
+
+/*
+ * Purge past events.
+ * Caller must take care of locking.
+ */
+static void irqt_purge(ktime_t now, struct list_head *head)
+{
+	struct irqt_prediction *entry, *n;
+
+	list_for_each_entry_safe(entry, n, head, node) {
+		if (ktime_after(entry->time, now))
+			break;
+		list_del_init(&entry->node);
+	}
+}
+
+/*
+ * Enqueue the next predicted event for this IRQ on this CPU.
+ * We are in interrupt context with IRQs disabled.
+ */
+static void irqt_enqueue_prediction(ktime_t now, struct irqt_stat *s)
+{
+	int this_cpu = raw_smp_processor_id();
+	int prev_cpu = s->prediction.cpu;
+	struct list_head *head = &per_cpu(irqt_predictions, this_cpu);
+	u32 predicted_interval = s->n_mean / IRQT_INTERVAL_WINDOW;
+	struct irqt_prediction *list_entry, *new_entry;
+	raw_spinlock_t *lock;
+
+	if (unlikely(prev_cpu != this_cpu && prev_cpu != -1)) {
+		lock = &per_cpu(irqt_predictions_lock, prev_cpu);
+		raw_spin_lock(lock);
+		list_del_init(&s->prediction.node);
+		raw_spin_unlock(lock);
+	}
+
+	lock = &per_cpu(irqt_predictions_lock, this_cpu);
+	raw_spin_lock(lock);
+	irqt_purge(now, head);
+	__list_del_entry(&s->prediction.node);
+	new_entry = &s->prediction;
+	new_entry->time = ktime_add_us(now, predicted_interval);
+	new_entry->cpu = this_cpu;
+	list_for_each_entry(list_entry, head, node)
+		if (ktime_after(new_entry->time, list_entry->time))
+			break;
+	list_add_tail(&new_entry->node, &list_entry->node);
+	raw_spin_unlock(lock);
+}
+
+/**
+ * irqt_get_next_prediction - get relative time before next predicted IRQ
+ *
+ * @cpu: the CPU number for which a prediction is wanted
+ *
+ * This returns the relative time in microsecs before the next expected IRQ
+ * on given CPU, or zero if no prediction is available. Those predictions
+ * are not guaranteed to be reliable, and guaranteed to fail from time to
+ * time i.e. when the predicted IRQ simply never comes, etc.
+ */
+s64 irqt_get_next_prediction(int cpu)
+{
+	raw_spinlock_t *lock = &per_cpu(irqt_predictions_lock, cpu);
+	struct list_head *head = &per_cpu(irqt_predictions, cpu);
+	unsigned long flags;
+	ktime_t now;
+	struct irqt_prediction *next;
+	s64 result;
+
+	raw_spin_lock_irqsave(lock, flags);
+	now = ktime_get();
+	irqt_purge(now, head);
+	next = list_first_entry_or_null(head, struct irqt_prediction, node);
+	result = next ? ktime_us_delta(next->time, now) : 0;
+	raw_spin_unlock_irqrestore(lock, flags);
+	return result;
+}
+
+/*
+ * irqt_process - update timing interval statistics for the given IRQ
+ *
+ * @irq: the IRQ number
+ * @stat: the corresponding IRQ timing stats record
+ *
+ * This is assumed to be called in IRQ context with desc->lock held and
+ * IRQs turned off.
+ */
+void irqt_process(unsigned int irq, struct irqt_stat *s)
+{
+	ktime_t now = ktime_get();
+	ktime_t ktime_interval = ktime_sub(now, s->last_time);
+	u32 oldX, newX, n = IRQT_INTERVAL_WINDOW;
+	s32 delta, n_dold, n_dnew;
+
+	s->last_time = now;
+
+	/* An interval needs at least two events */
+	if (unlikely(ktime_equal(now, ktime_interval)))
+		return;
+
+	/*
+	 * There is no point attempting predictions on interrupts more
+	 * than 1 second apart. This has no benefit for sleep state
+	 * selection and increases the risk of overflowing our variance
+	 * computation. Reset all stats in that case.
+	 */
+	if (unlikely(ktime_after(ktime_interval, ktime_set(1, 0)))) {
+		s->n_mean = 0;
+		return;
+	}
+
+	/* microsecs is good enough */
+	newX = ktime_to_us(ktime_interval);
+
+	/* Seed the stats with the first interval */
+	if (unlikely(!s->n_mean)) {
+		int i;
+
+		s->n_M2 = 0;
+		s->n_mean = newX * n;
+		for (i = 0; i < IRQT_INTERVAL_WINDOW; i++)
+			s->intervals[i] = newX;
+		s->predictable = s->unpredictable = 0;
+		return;
+	}
+
+	/* Replace the oldest interval in our window */
+	oldX = s->intervals[s->w_ptr];
+	s->intervals[s->w_ptr] = newX;
+	s->w_ptr = (s->w_ptr + 1) % IRQT_INTERVAL_WINDOW;
+
+	/*
+	 * The variance gives us an instantaneous deviation from the
+	 * mean interval value. Given x a new inter-IRQ interval and n the
+	 * number of such intervals to date:
+	 *
+	 *	n = n + 1
+	 *	delta = x - mean
+	 *	mean = mean + delta/n
+	 *	M2 = M2 + delta*(x - mean)
+	 *
+	 *	variance = M2/(n - 1)
+	 *
+	 * We want to update the variance over a window of recent intervals
+	 * in order to stay current with changing IRQ patterns. To remove
+	 * the contribution from a sample x:
+	 *
+	 *	n = n - 1
+	 *	delta = x - mean
+	 *	mean = mean - delta/n
+	 *	M2 = M2 - delta*(x - mean)
+	 *
+	 * Combining those equations, we update both the mean and
+	 * variance by removing the contribution from the oldest window
+	 * sample and adding the latest one at the same time:
+	 *
+	 *	delta = newX - oldX
+	 *	dold = oldX - mean
+	 *	mean = mean + delta/n
+	 *	dnew = newX - mean
+	 *	M2 = M2 + delta * (dold + dnew)
+	 *
+	 * Ref:
+	 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+	 *
+	 * However this is unstable if performed with integer math due to
+	 * the accumulation of bit truncation errors caused by the division.
+	 * To avoid that, let's factor out the division. Assuming
+	 * n_mean = n * mean:
+	 *
+	 *	delta = newX - oldX
+	 *	n_dold = n * oldX - n_mean
+	 *	n_mean = n_mean + delta
+	 *	n_dnew = n * newX - n_mean
+	 *	n_M2 = n_M2 + delta * (n_dold + n_dnew)
+	 *
+	 *	variance = n_M2/n / (n - 1)
+	 *
+	 * To make things as efficient as possible, we keep our window
+	 * size constant: n = IRQT_INTERVAL_WINDOW.
+	 */
+	delta = newX - oldX;
+	n_dold = n*oldX - s->n_mean;
+	s->n_mean += delta;
+	n_dnew = n*newX - s->n_mean;
+	s->n_M2 += (s64)delta * (n_dold + n_dnew);
+
+	/*
+	 * Let's determine if this interrupt actually happened after a
+	 * periodic interval. We treat a standard deviation greater than
+	 * the mean value as a signal that the current interval is no longer
+	 * stable enough to be predictable.
+	 *
+	 *	mean < SD  -->  mean < sqrt(variance)  -->  mean^2 < variance
+	 *
+	 *	n_mean/n * n_mean/n < n_M2/n / (n - 1)  -->
+	 *	n_mean * n_mean * (n - 1) < n_M2 * n
+	 */
+	if ((u64)s->n_mean * s->n_mean * (n - 1) > s->n_M2 * n) {
+		s->predictable++;
+		if (s->predictable >= IRQT_INTERVAL_WINDOW)
+			irqt_enqueue_prediction(now, s);
+	} else {
+		s->predictable = 0;
+		s->unpredictable++;
+	}
+
+	trace_irq_timings(irq, newX, div_u64(s->n_M2, n*(n-1)), s->n_mean/n,
+			  s->predictable, s->unpredictable);
+}
+
+/*
+ * Called from __setup_irq() after successful registration of a new action
+ * handler.
+ */
+int irqt_register(struct irq_desc *desc)
+{
+	struct irqt_stat *s;
+	unsigned long flags;
+	int ret;
+
+	if (desc->irq_timings)
+		return 0;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&s->prediction.node);
+	s->prediction.cpu = -1;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	if (desc->irq_timings) {
+		/* someone else raced ahead of us */
+		ret = 0;
+	} else if (!desc->action) {
+		/* unused IRQ? */
+		ret = -ENXIO;
+	} else if (irq_settings_is_per_cpu(desc)) {
+		/* we're not set for per-CPU accounting */
+		pr_warn("IRQ %d: can't do timing stats on per-CPU IRQs\n",
+			desc->action->irq);
+		ret = -ENOSYS;
+	} else {
+		desc->irq_timings = s;
+		s = NULL;
+		ret = 0;
+	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	if (s)
+		kfree(s);
+	return ret;
+}
+
+/*
+ * Called from __free_irq() when there is no longer any handler attached
+ * to the IRQ descriptor. Must be called with desc->lock held.
+ */
+void irqt_unregister(struct irq_desc *desc)
+{
+	struct irqt_stat *s;
+	int cpu;
+	raw_spinlock_t *lock;
+
+	assert_raw_spin_locked(&desc->lock);
+	if (!desc->irq_timings)
+		return;
+	s = desc->irq_timings;
+	desc->irq_timings = NULL;
+	cpu = s->prediction.cpu;
+	if (cpu != -1) {
+		lock = &per_cpu(irqt_predictions_lock, cpu);
+		raw_spin_lock(lock);
+		__list_del_entry(&s->prediction.node);
+		raw_spin_unlock(lock);
+	}
+	kfree(s);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b4976ab65b39..ce4b31bc7f28 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4968,21 +4968,45 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		if (idle_cpu(i)) {
			struct rq *rq = cpu_rq(i);
			struct cpuidle_state *idle = idle_get_state(rq);
-			if (idle && idle->exit_latency < min_exit_latency) {
+
+			if (idle) {
+
				/*
-				 * We give priority to a CPU whose idle state
-				 * has the smallest exit latency irrespective
-				 * of any idle timestamp.
+				 * When we want to save energy, exclude cpu which did not reach
+				 * the break even point in the idle state
				 */
-				min_exit_latency = idle->exit_latency;
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
-				   rq->idle_stamp > latest_idle_timestamp) {
+				if (sched_feat(ENERGY_IDLE) &&
+				    ((ktime_to_us(ktime_get()) - idle->idle_stamp <
+				      idle->target_residency)))
+					continue;
+
+				if (idle->exit_latency < min_exit_latency) {
+					/*
+					 * We give priority to a CPU
+					 * whose idle state has the
+					 * smallest exit latency
+					 * irrespective of any idle
+					 * timestamp.
+					 */
+					min_exit_latency = idle->exit_latency;
+					latest_idle_timestamp = idle->idle_stamp;
+					shallowest_idle_cpu = i;
+				} else if (idle->exit_latency == min_exit_latency &&
+					   idle->idle_stamp > latest_idle_timestamp) {
+					/*
+					 * If the CPU is in the same
+					 * idle state, choose the more
+					 * recent one as it might have
+					 * a warmer cache
+					 */
+					latest_idle_timestamp = idle->idle_stamp;
+					shallowest_idle_cpu = i;
+				}
+			} else if (rq->idle_stamp > latest_idle_timestamp) {
				/*
-				 * If equal or no active idle state, then
-				 * the most recently idled CPU might have
-				 * a warmer cache.
+				 * If no active idle state, then the
+				 * most recent idled CPU might have a
+				 * warmer cache
				 */
				latest_idle_timestamp = rq->idle_stamp;
				shallowest_idle_cpu = i;
@@ -4996,7 +5020,15 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	/*
+	 * If there is a non idle cpu different from the current one,
+	 * let's use it if we want to save energy by not waking up an
+	 * idle cpu, otherwise let's use the shallowest idle cpu
+	 */
+	if (sched_feat(ENERGY_IDLE) && least_loaded_cpu != this_cpu)
+		return least_loaded_cpu;
+	else
+		return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..b14f8ebcf4b1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,6 +61,11 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 
 /*
+ * Apply energy saving agressive policy when idle
+ */
+SCHED_FEAT(ENERGY_IDLE, true)
+
+/*
 * Apply the automatic NUMA scheduling policy. Enabled automatically
 * at runtime if running on a NUMA machine. Can be controlled via
 * numa_balancing=
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..ffef99b2ad03 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 #include <linux/tick.h>
+#include <linux/pm_qos.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
 
@@ -42,18 +43,6 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
 __setup("hlt", cpu_idle_nopoll_setup);
 #endif
 
-static inline int cpu_idle_poll(void)
-{
-	rcu_idle_enter();
-	trace_cpu_idle_rcuidle(0, smp_processor_id());
-	local_irq_enable();
-	while (!tif_need_resched())
-		cpu_relax();
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
-	rcu_idle_exit();
-	return 1;
-}
-
 /* Weak implementations for optional arch specific functions */
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
@@ -65,6 +54,23 @@ void __weak arch_cpu_idle(void)
 	local_irq_enable();
 }
 
+void __weak arch_cpu_idle_poll(void)
+{
+	local_irq_enable();
+	while (!tif_need_resched())
+		cpu_relax();
+}
+
+static inline int cpu_idle_poll(void)
+{
+	rcu_idle_enter();
+	trace_cpu_idle_rcuidle(0, smp_processor_id());
+	arch_cpu_idle_poll();
+	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	rcu_idle_exit();
+	return 1;
+}
+
 /**
 * cpuidle_idle_call - the main idle function
 *
@@ -74,7 +80,7 @@ void __weak arch_cpu_idle(void)
 * set, and it returns with polling set.  If it ever stops polling, it
 * must clear the polling bit.
 */
-static void cpuidle_idle_call(void)
+static void cpuidle_idle_call(unsigned int latency_req, s64 next_timer_event)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
@@ -107,7 +113,7 @@ static void cpuidle_idle_call(void)
	 * Ask the cpuidle framework to choose a convenient idle state.
	 * Fall back to the default arch idle method on errors.
	 */
-	next_state = cpuidle_select(drv, dev);
+	next_state = cpuidle_select(drv, dev, latency_req, next_timer_event);
 	if (next_state < 0) {
 use_default:
 		/*
@@ -166,7 +172,8 @@ use_default:
 	/*
	 * Give the governor an opportunity to reflect on the outcome
	 */
-	cpuidle_reflect(dev, entered_state);
+	if (entered_state >= 0)
+		cpuidle_reflect(dev, entered_state);
 
 exit_idle:
 	__current_set_polling();
@@ -188,6 +195,9 @@ exit_idle:
 */
 static void cpu_idle_loop(void)
 {
+	unsigned int latency_req;
+	s64 next_timer_event;
+
 	while (1) {
 		/*
		 * If the arch has a polling bit, we maintain an invariant:
@@ -211,19 +221,30 @@ static void cpu_idle_loop(void)
			local_irq_disable();
			arch_cpu_idle_enter();
 
+			latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+
+			next_timer_event =
+				ktime_to_us(tick_nohz_get_sleep_length());
+
			/*
			 * In poll mode we reenable interrupts and spin.
			 *
+			 * If the latency req is zero, we don't want to
+			 * enter any idle state and we jump to the poll
+			 * function directly
+			 *
			 * Also if we detected in the wakeup from idle
			 * path that the tick broadcast device expired
			 * for us, we don't want to go deep idle as we
			 * know that the IPI is going to arrive right
			 * away
			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			if (!latency_req || cpu_idle_force_poll ||
+			    tick_check_broadcast_expired())
				cpu_idle_poll();
			else
-				cpuidle_idle_call();
+				cpuidle_idle_call(latency_req,
						  next_timer_event);
 
			arch_cpu_idle_exit();
 		}
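Editor's note: the window arithmetic that kernel/irq/timings.c documents above can be exercised in isolation. The sketch below replays the n-scaled update (delta, n_dold, n_dnew, n_M2) and the predictability test (n_mean^2 * (n - 1) > n_M2 * n) in plain user-space C. The struct and function names plus the sample intervals are illustrative; only the arithmetic and the integer types follow the patch.

#include <stdio.h>
#include <stdint.h>

#define WINDOW 3	/* mirrors IRQT_INTERVAL_WINDOW */

struct irqt_window {
	uint32_t intervals[WINDOW];	/* recent inter-IRQ intervals, us */
	unsigned int w_ptr;		/* oldest slot in the window */
	uint32_t n_mean;		/* n * mean, as in the patch */
	int64_t n_M2;			/* n * M2 (variance numerator) */
};

/* Replace the oldest sample with newX; O(1) mean/variance update. */
static void window_update(struct irqt_window *s, uint32_t newX)
{
	const int32_t n = WINDOW;
	uint32_t oldX = s->intervals[s->w_ptr];
	int32_t delta, n_dold, n_dnew;

	s->intervals[s->w_ptr] = newX;
	s->w_ptr = (s->w_ptr + 1) % WINDOW;

	/* Same u32/s32 wraparound behaviour the kernel code relies on */
	delta = newX - oldX;
	n_dold = n * oldX - s->n_mean;
	s->n_mean += delta;
	n_dnew = n * newX - s->n_mean;
	s->n_M2 += (int64_t)delta * (n_dold + n_dnew);
}

/* Predictable iff mean > stddev, i.e. n_mean^2 * (n-1) > n_M2 * n. */
static int window_predictable(const struct irqt_window *s)
{
	const int64_t n = WINDOW;

	return (int64_t)s->n_mean * s->n_mean * (n - 1) > s->n_M2 * n;
}

int main(void)
{
	struct irqt_window s = {
		.intervals = { 100, 100, 100 },
		.n_mean = 3 * 100,	/* seeded the way irqt_process() seeds */
	};
	uint32_t samples[] = { 101, 99, 100, 2500 };
	unsigned int i;

	/* Steady ~100us intervals stay predictable; the 2500us outlier
	 * blows up the variance and drops the predictable flag. */
	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		window_update(&s, samples[i]);
		printf("interval=%u mean=%u predictable=%d\n",
		       samples[i], s.n_mean / WINDOW, window_predictable(&s));
	}
	return 0;
}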