aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/arm/pmu.txt3
-rw-r--r--arch/arm/Kconfig85
-rw-r--r--arch/arm/include/asm/perf_event.h5
-rw-r--r--arch/arm/include/asm/pmu.h40
-rw-r--r--arch/arm/include/asm/topology.h34
-rw-r--r--arch/arm/kernel/hw_breakpoint.c57
-rw-r--r--arch/arm/kernel/perf_event.c103
-rw-r--r--arch/arm/kernel/perf_event_cpu.c169
-rw-r--r--arch/arm/kernel/perf_event_v6.c130
-rw-r--r--arch/arm/kernel/perf_event_v7.c295
-rw-r--r--arch/arm/kernel/perf_event_xscale.c161
-rw-r--r--arch/arm/kernel/topology.c125
-rw-r--r--arch/ia64/include/asm/topology.h1
-rw-r--r--arch/tile/include/asm/topology.h1
-rw-r--r--include/linux/sched.h29
-rw-r--r--include/linux/topology.h3
-rw-r--r--include/trace/events/sched.h153
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/debug.c39
-rw-r--r--kernel/sched/fair.c1951
-rw-r--r--kernel/sched/sched.h65
-rw-r--r--linaro/configs/big-LITTLE-MP.conf13
22 files changed, 2924 insertions, 554 deletions
diff --git a/Documentation/devicetree/bindings/arm/pmu.txt b/Documentation/devicetree/bindings/arm/pmu.txt
index 343781b9f24..4ce82d045a6 100644
--- a/Documentation/devicetree/bindings/arm/pmu.txt
+++ b/Documentation/devicetree/bindings/arm/pmu.txt
@@ -16,6 +16,9 @@ Required properties:
"arm,arm1176-pmu"
"arm,arm1136-pmu"
- interrupts : 1 combined interrupt or 1 per core.
+- cluster : a phandle to the cluster to which it belongs
+ If there are more than one cluster with same CPU type
+ then there should be separate PMU nodes per cluster.
Example:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ade7e924bef..f8b7b7f31da 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1556,6 +1556,91 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+ bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+ help
+ Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+ bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+ depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+ help
+ Experimental scheduler optimizations for heterogeneous platforms.
+ Attempts to introspectively select task affinity to optimize power
+ and performance. Basic support for multiple (>2) cpu types is in place,
+ but it has only been tested with two types of cpus.
+ There is currently no support for migration of task groups, hence
+ !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled
+ between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE).
+
+config SCHED_HMP_PRIO_FILTER
+ bool "(EXPERIMENTAL) Filter HMP migrations by task priority"
+ depends on SCHED_HMP
+ default y
+ help
+ Enables task priority based HMP migration filter. Any task with
+ a NICE value above the threshold will always be on low-power cpus
+ with less compute capacity.
+
+config SCHED_HMP_PRIO_FILTER_VAL
+ int "NICE priority threshold"
+ default 5
+ depends on SCHED_HMP_PRIO_FILTER
+
+config HMP_FAST_CPU_MASK
+ string "HMP scheduler fast CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the fast CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+ string "HMP scheduler slow CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the slow CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_VARIABLE_SCALE
+ bool "Allows changing the load tracking scale through sysfs"
+ depends on SCHED_HMP
+ help
+ When turned on, this option exports the thresholds and load average
+ period value for the load tracking patches through sysfs.
+ The values can be modified to change the rate of load accumulation
+ and the thresholds used for HMP migration.
+ The load_avg_period_ms is the time in ms to reach a load average of
+ 0.5 for an idle task of 0 load average ratio that start a busy loop.
+ The up_threshold and down_threshold is the value to go to a faster
+ CPU or to go back to a slower cpu.
+ The {up,down}_threshold are devided by 1024 before being compared
+ to the load average.
+ For examples, with load_avg_period_ms = 128 and up_threshold = 512,
+ a running task with a load of 0 will be migrated to a bigger CPU after
+ 128ms, because after 128ms its load_avg_ratio is 0.5 and the real
+ up_threshold is 0.5.
+ This patch has the same behavior as changing the Y of the load
+ average computation to
+ (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
+ but it remove intermadiate overflows in computation.
+
+config HMP_FREQUENCY_INVARIANT_SCALE
+ bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
+ depends on HMP_VARIABLE_SCALE && CPU_FREQ
+ help
+ Scales the current load contribution in line with the frequency
+ of the CPU that the task was executed on.
+ In this version, we use a simple linear scale derived from the
+ maximum frequency reported by CPUFreq.
+ Restricting tracked load to be scaled by the CPU's frequency
+ represents the consumption of possible compute capacity
+ (rather than consumption of actual instantaneous capacity as
+ normal) and allows the HMP migration's simple threshold
+ migration strategy to interact more predictably with CPUFreq's
+ asynchronous compute capacity changes.
+
config HAVE_ARM_SCU
bool
help
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 625cd621a43..00416edecea 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -21,4 +21,9 @@
#define C(_x) PERF_COUNT_HW_CACHE_##_x
#define CACHE_OP_UNSUPPORTED 0xFFFF
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
#endif /* __ARM_PERF_EVENT_H__ */
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index a26170dce02..0cd7824ca76 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -62,25 +62,37 @@ struct pmu_hw_events {
raw_spinlock_t pmu_lock;
};
+struct cpupmu_regs {
+ u32 pmc;
+ u32 pmcntenset;
+ u32 pmuseren;
+ u32 pmintenset;
+ u32 pmxevttype[8];
+ u32 pmxevtcnt[8];
+};
+
struct arm_pmu {
struct pmu pmu;
cpumask_t active_irqs;
+ cpumask_t valid_cpus;
char *name;
irqreturn_t (*handle_irq)(int irq_num, void *dev);
- void (*enable)(struct hw_perf_event *evt, int idx);
- void (*disable)(struct hw_perf_event *evt, int idx);
+ void (*enable)(struct perf_event *event);
+ void (*disable)(struct perf_event *event);
int (*get_event_idx)(struct pmu_hw_events *hw_events,
- struct hw_perf_event *hwc);
+ struct perf_event *event);
int (*set_event_filter)(struct hw_perf_event *evt,
struct perf_event_attr *attr);
- u32 (*read_counter)(int idx);
- void (*write_counter)(int idx, u32 val);
- void (*start)(void);
- void (*stop)(void);
+ u32 (*read_counter)(struct perf_event *event);
+ void (*write_counter)(struct perf_event *event, u32 val);
+ void (*start)(struct arm_pmu *);
+ void (*stop)(struct arm_pmu *);
void (*reset)(void *);
- int (*request_irq)(irq_handler_t handler);
- void (*free_irq)(void);
+ int (*request_irq)(struct arm_pmu *, irq_handler_t handler);
+ void (*free_irq)(struct arm_pmu *);
int (*map_event)(struct perf_event *event);
+ void (*save_regs)(struct arm_pmu *, struct cpupmu_regs *);
+ void (*restore_regs)(struct arm_pmu *, struct cpupmu_regs *);
int num_events;
atomic_t active_events;
struct mutex reserve_mutex;
@@ -93,15 +105,11 @@ struct arm_pmu {
extern const struct dev_pm_ops armpmu_dev_pm_ops;
-int armpmu_register(struct arm_pmu *armpmu, char *name, int type);
+int armpmu_register(struct arm_pmu *armpmu, int type);
-u64 armpmu_event_update(struct perf_event *event,
- struct hw_perf_event *hwc,
- int idx);
+u64 armpmu_event_update(struct perf_event *event);
-int armpmu_event_set_period(struct perf_event *event,
- struct hw_perf_event *hwc,
- int idx);
+int armpmu_event_set_period(struct perf_event *event);
int armpmu_map_event(struct perf_event *event,
const unsigned (*event_map)[PERF_COUNT_HW_MAX],
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..983fa7c153a 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -26,11 +26,45 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask);
+
+#ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .idle_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ \
+ .flags = 0*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+#endif
+#endif /* CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE */
#else
static inline void init_cpu_topology(void) { }
static inline void store_cpu_topology(unsigned int cpuid) { }
+static inline int cluster_to_logical_mask(unsigned int socket_id,
+ cpumask_t *cluster_mask) { return -EINVAL; }
#endif
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 281bf330124..eed4d0cdd74 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -28,6 +28,7 @@
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/smp.h>
+#include <linux/cpu_pm.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
@@ -42,6 +43,11 @@ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]);
/* Watchpoint currently in use for each WRP. */
static DEFINE_PER_CPU(struct perf_event *, wp_on_reg[ARM_MAX_WRP]);
+#ifdef CONFIG_CPU_PM
+/* Storage for OS Save and Restore. */
+static DEFINE_PER_CPU(u32, cpu_dscr);
+#endif
+
/* Number of BRP/WRP registers on this CPU. */
static int core_num_brps;
static int core_num_wrps;
@@ -990,6 +996,55 @@ static struct notifier_block __cpuinitdata dbg_reset_nb = {
.notifier_call = dbg_reset_notify,
};
+#ifdef CONFIG_CPU_PM
+static void os_save(int cpu)
+{
+ /* Set OS Lock. */
+ asm volatile("mcr p14, 0, %0, c1, c0, 4" : : "r" (0xC5ACCE55));
+ isb();
+
+ /* Save DSCRext. */
+ ARM_DBG_READ(c2, 2, per_cpu(cpu_dscr, cpu));
+}
+
+static void os_restore(int cpu)
+{
+ /* Restore DSCRext. */
+ ARM_DBG_WRITE(c2, 2, per_cpu(cpu_dscr, cpu));
+
+ /* Clear OS Lock. */
+ asm volatile("mcr p14, 0, %0, c1, c0, 4" : : "r" (0));
+ isb();
+}
+
+static int dbg_cpu_pm_notify(struct notifier_block *self, unsigned long action,
+ void *v)
+{
+ int cpu = smp_processor_id();
+
+ if (action == CPU_PM_ENTER)
+ os_save(cpu);
+ else if (action == CPU_PM_EXIT)
+ os_restore(cpu);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata dbg_cpu_pm_nb = {
+ .notifier_call = dbg_cpu_pm_notify,
+};
+
+static void __init pm_init(void)
+{
+ if (get_debug_arch() == ARM_DEBUG_ARCH_V7_1)
+ cpu_pm_register_notifier(&dbg_cpu_pm_nb);
+}
+#else
+static inline void pm_init(void)
+{
+}
+#endif
+
static int __init arch_hw_breakpoint_init(void)
{
u32 dscr;
@@ -1048,6 +1103,8 @@ static int __init arch_hw_breakpoint_init(void)
/* Register hotplug notifier. */
register_cpu_notifier(&dbg_reset_nb);
+
+ pm_init();
return 0;
}
arch_initcall(arch_hw_breakpoint_init);
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 53c0304b734..72c4fbe63d1 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -12,6 +12,7 @@
*/
#define pr_fmt(fmt) "hw perfevents: " fmt
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
@@ -81,17 +82,18 @@ armpmu_map_event(struct perf_event *event,
return armpmu_map_cache_event(cache_map, config);
case PERF_TYPE_RAW:
return armpmu_map_raw_event(raw_event_mask, config);
+ default:
+ if (event->attr.type >= PERF_TYPE_MAX)
+ return armpmu_map_raw_event(raw_event_mask, config);
}
return -ENOENT;
}
-int
-armpmu_event_set_period(struct perf_event *event,
- struct hw_perf_event *hwc,
- int idx)
+int armpmu_event_set_period(struct perf_event *event)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int ret = 0;
@@ -119,24 +121,22 @@ armpmu_event_set_period(struct perf_event *event,
local64_set(&hwc->prev_count, (u64)-left);
- armpmu->write_counter(idx, (u64)(-left) & 0xffffffff);
+ armpmu->write_counter(event, (u64)(-left) & 0xffffffff);
perf_event_update_userpage(event);
return ret;
}
-u64
-armpmu_event_update(struct perf_event *event,
- struct hw_perf_event *hwc,
- int idx)
+u64 armpmu_event_update(struct perf_event *event)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u64 delta, prev_raw_count, new_raw_count;
again:
prev_raw_count = local64_read(&hwc->prev_count);
- new_raw_count = armpmu->read_counter(idx);
+ new_raw_count = armpmu->read_counter(event);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
@@ -159,7 +159,7 @@ armpmu_read(struct perf_event *event)
if (hwc->idx < 0)
return;
- armpmu_event_update(event, hwc, hwc->idx);
+ armpmu_event_update(event);
}
static void
@@ -168,23 +168,26 @@ armpmu_stop(struct perf_event *event, int flags)
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
/*
* ARM pmu always has to update the counter, so ignore
* PERF_EF_UPDATE, see comments in armpmu_start().
*/
if (!(hwc->state & PERF_HES_STOPPED)) {
- armpmu->disable(hwc, hwc->idx);
- armpmu_event_update(event, hwc, hwc->idx);
+ armpmu->disable(event);
+ armpmu_event_update(event);
hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
}
}
-static void
-armpmu_start(struct perf_event *event, int flags)
+static void armpmu_start(struct perf_event *event, int flags)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
/*
* ARM pmu always has to reprogram the period, so ignore
* PERF_EF_RELOAD, see the comment below.
@@ -200,8 +203,8 @@ armpmu_start(struct perf_event *event, int flags)
* get an interrupt too soon or *way* too late if the overflow has
* happened since disabling.
*/
- armpmu_event_set_period(event, hwc, hwc->idx);
- armpmu->enable(hwc, hwc->idx);
+ armpmu_event_set_period(event);
+ armpmu->enable(event);
}
static void
@@ -212,6 +215,9 @@ armpmu_del(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
+
WARN_ON(idx < 0);
armpmu_stop(event, PERF_EF_UPDATE);
@@ -230,10 +236,14 @@ armpmu_add(struct perf_event *event, int flags)
int idx;
int err = 0;
+ /* An event following a process won't be stopped earlier */
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return 0;
+
perf_pmu_disable(event->pmu);
/* If we don't have a space for the counter then finish early. */
- idx = armpmu->get_event_idx(hw_events, hwc);
+ idx = armpmu->get_event_idx(hw_events, event);
if (idx < 0) {
err = idx;
goto out;
@@ -244,7 +254,7 @@ armpmu_add(struct perf_event *event, int flags)
* sure it is disabled.
*/
event->hw.idx = idx;
- armpmu->disable(hwc, idx);
+ armpmu->disable(event);
hw_events->events[idx] = event;
hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
@@ -264,13 +274,12 @@ validate_event(struct pmu_hw_events *hw_events,
struct perf_event *event)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
- struct hw_perf_event fake_event = event->hw;
struct pmu *leader_pmu = event->group_leader->pmu;
if (event->pmu != leader_pmu || event->state <= PERF_EVENT_STATE_OFF)
return 1;
- return armpmu->get_event_idx(hw_events, &fake_event) >= 0;
+ return armpmu->get_event_idx(hw_events, event) >= 0;
}
static int
@@ -316,7 +325,7 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
static void
armpmu_release_hardware(struct arm_pmu *armpmu)
{
- armpmu->free_irq();
+ armpmu->free_irq(armpmu);
pm_runtime_put_sync(&armpmu->plat_device->dev);
}
@@ -330,7 +339,7 @@ armpmu_reserve_hardware(struct arm_pmu *armpmu)
return -ENODEV;
pm_runtime_get_sync(&pmu_device->dev);
- err = armpmu->request_irq(armpmu_dispatch_irq);
+ err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
if (err) {
armpmu_release_hardware(armpmu);
return err;
@@ -429,6 +438,10 @@ static int armpmu_event_init(struct perf_event *event)
int err = 0;
atomic_t *active_events = &armpmu->active_events;
+ if (event->cpu != -1 &&
+ !cpumask_test_cpu(event->cpu, &armpmu->valid_cpus))
+ return -ENOENT;
+
/* does not support taken branch sampling */
if (has_branch_stack(event))
return -EOPNOTSUPP;
@@ -465,13 +478,13 @@ static void armpmu_enable(struct pmu *pmu)
int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events);
if (enabled)
- armpmu->start();
+ armpmu->start(armpmu);
}
static void armpmu_disable(struct pmu *pmu)
{
struct arm_pmu *armpmu = to_arm_pmu(pmu);
- armpmu->stop();
+ armpmu->stop(armpmu);
}
#ifdef CONFIG_PM_RUNTIME
@@ -517,12 +530,12 @@ static void __init armpmu_init(struct arm_pmu *armpmu)
};
}
-int armpmu_register(struct arm_pmu *armpmu, char *name, int type)
+int armpmu_register(struct arm_pmu *armpmu, int type)
{
armpmu_init(armpmu);
pr_info("enabled with %s PMU driver, %d counters available\n",
armpmu->name, armpmu->num_events);
- return perf_pmu_register(&armpmu->pmu, name, type);
+ return perf_pmu_register(&armpmu->pmu, armpmu->name, type);
}
/*
@@ -576,6 +589,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct frame_tail __user *tail;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* We don't support guest os callchain now */
+ return;
+ }
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
@@ -603,9 +620,41 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct stackframe fr;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* We don't support guest os callchain now */
+ return;
+ }
+
fr.fp = regs->ARM_fp;
fr.sp = regs->ARM_sp;
fr.lr = regs->ARM_lr;
fr.pc = regs->ARM_pc;
walk_stackframe(&fr, callchain_trace, entry);
}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+ return perf_guest_cbs->get_guest_ip();
+
+ return instruction_pointer(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ int misc = 0;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_cbs->is_user_mode())
+ misc |= PERF_RECORD_MISC_GUEST_USER;
+ else
+ misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ } else {
+ if (user_mode(regs))
+ misc |= PERF_RECORD_MISC_USER;
+ else
+ misc |= PERF_RECORD_MISC_KERNEL;
+ }
+
+ return misc;
+}
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 8d7d8d4de9d..66248ee706c 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -19,10 +19,12 @@
#define pr_fmt(fmt) "CPU PMU: " fmt
#include <linux/bitmap.h>
+#include <linux/cpu_pm.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/platform_device.h>
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/cputype.h>
@@ -30,33 +32,36 @@
#include <asm/pmu.h>
/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *cpu_pmu;
+static DEFINE_PER_CPU(struct arm_pmu *, cpu_pmu);
static DEFINE_PER_CPU(struct perf_event * [ARMPMU_MAX_HWEVENTS], hw_events);
static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)], used_mask);
static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events);
+static DEFINE_PER_CPU(struct cpupmu_regs, cpu_pmu_regs);
+
/*
* Despite the names, these two functions are CPU-specific and are used
* by the OProfile/perf code.
*/
const char *perf_pmu_name(void)
{
- if (!cpu_pmu)
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, 0);
+ if (!pmu)
return NULL;
- return cpu_pmu->pmu.name;
+ return pmu->name;
}
EXPORT_SYMBOL_GPL(perf_pmu_name);
int perf_num_counters(void)
{
- int max_events = 0;
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, 0);
- if (cpu_pmu != NULL)
- max_events = cpu_pmu->num_events;
+ if (!pmu)
+ return 0;
- return max_events;
+ return pmu->num_events;
}
EXPORT_SYMBOL_GPL(perf_num_counters);
@@ -70,15 +75,17 @@ static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
return &__get_cpu_var(cpu_hw_events);
}
-static void cpu_pmu_free_irq(void)
+static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
{
int i, irq, irqs;
struct platform_device *pmu_device = cpu_pmu->plat_device;
+ int cpu = -1;
irqs = min(pmu_device->num_resources, num_possible_cpus());
for (i = 0; i < irqs; ++i) {
- if (!cpumask_test_and_clear_cpu(i, &cpu_pmu->active_irqs))
+ cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus);
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
continue;
irq = platform_get_irq(pmu_device, i);
if (irq >= 0)
@@ -86,10 +93,11 @@ static void cpu_pmu_free_irq(void)
}
}
-static int cpu_pmu_request_irq(irq_handler_t handler)
+static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
{
int i, err, irq, irqs;
struct platform_device *pmu_device = cpu_pmu->plat_device;
+ int cpu = -1;
if (!pmu_device)
return -ENODEV;
@@ -102,6 +110,7 @@ static int cpu_pmu_request_irq(irq_handler_t handler)
for (i = 0; i < irqs; ++i) {
err = 0;
+ cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus);
irq = platform_get_irq(pmu_device, i);
if (irq < 0)
continue;
@@ -111,7 +120,7 @@ static int cpu_pmu_request_irq(irq_handler_t handler)
* assume that we're running on a uniprocessor machine and
* continue. Otherwise, continue without this interrupt.
*/
- if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) {
+ if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n",
irq, i);
continue;
@@ -125,7 +134,7 @@ static int cpu_pmu_request_irq(irq_handler_t handler)
return err;
}
- cpumask_set_cpu(i, &cpu_pmu->active_irqs);
+ cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
}
return 0;
@@ -134,7 +143,7 @@ static int cpu_pmu_request_irq(irq_handler_t handler)
static void __devinit cpu_pmu_init(struct arm_pmu *cpu_pmu)
{
int cpu;
- for_each_possible_cpu(cpu) {
+ for_each_cpu_mask(cpu, cpu_pmu->valid_cpus) {
struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu);
events->events = per_cpu(hw_events, cpu);
events->used_mask = per_cpu(used_mask, cpu);
@@ -147,7 +156,7 @@ static void __devinit cpu_pmu_init(struct arm_pmu *cpu_pmu)
/* Ensure the PMU has sane values out of reset. */
if (cpu_pmu && cpu_pmu->reset)
- on_each_cpu(cpu_pmu->reset, NULL, 1);
+ on_each_cpu_mask(&cpu_pmu->valid_cpus, cpu_pmu->reset, cpu_pmu, 1);
}
/*
@@ -159,11 +168,34 @@ static void __devinit cpu_pmu_init(struct arm_pmu *cpu_pmu)
static int __cpuinit cpu_pmu_notify(struct notifier_block *b,
unsigned long action, void *hcpu)
{
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, (long)hcpu);
+
if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
return NOTIFY_DONE;
- if (cpu_pmu && cpu_pmu->reset)
- cpu_pmu->reset(NULL);
+ if (pmu && pmu->reset)
+ pmu->reset(pmu);
+ else
+ return NOTIFY_DONE;
+
+ return NOTIFY_OK;
+}
+
+static int cpu_pmu_pm_notify(struct notifier_block *b,
+ unsigned long action, void *hcpu)
+{
+ int cpu = smp_processor_id();
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, cpu);
+ struct cpupmu_regs *pmuregs = &per_cpu(cpu_pmu_regs, cpu);
+
+ if (!pmu)
+ return NOTIFY_DONE;
+
+ if (action == CPU_PM_ENTER && pmu->save_regs) {
+ pmu->save_regs(pmu, pmuregs);
+ } else if (action == CPU_PM_EXIT && pmu->restore_regs) {
+ pmu->restore_regs(pmu, pmuregs);
+ }
return NOTIFY_OK;
}
@@ -172,6 +204,10 @@ static struct notifier_block __cpuinitdata cpu_pmu_hotplug_notifier = {
.notifier_call = cpu_pmu_notify,
};
+static struct notifier_block __cpuinitdata cpu_pmu_pm_notifier = {
+ .notifier_call = cpu_pmu_pm_notify,
+};
+
/*
* PMU platform driver and devicetree bindings.
*/
@@ -195,13 +231,13 @@ static struct platform_device_id __devinitdata cpu_pmu_plat_device_ids[] = {
/*
* CPU PMU identification and probing.
*/
-static struct arm_pmu *__devinit probe_current_pmu(void)
+static int __devinit probe_current_pmu(struct arm_pmu *pmu)
{
- struct arm_pmu *pmu = NULL;
int cpu = get_cpu();
unsigned long cpuid = read_cpuid_id();
unsigned long implementor = (cpuid & 0xFF000000) >> 24;
unsigned long part_number = (cpuid & 0xFFF0);
+ int ret = -ENODEV;
pr_info("probing PMU on CPU %d\n", cpu);
@@ -211,25 +247,25 @@ static struct arm_pmu *__devinit probe_current_pmu(void)
case 0xB360: /* ARM1136 */
case 0xB560: /* ARM1156 */
case 0xB760: /* ARM1176 */
- pmu = armv6pmu_init();
+ ret = armv6pmu_init(pmu);
break;
case 0xB020: /* ARM11mpcore */
- pmu = armv6mpcore_pmu_init();
+ ret = armv6mpcore_pmu_init(pmu);
break;
case 0xC080: /* Cortex-A8 */
- pmu = armv7_a8_pmu_init();
+ ret = armv7_a8_pmu_init(pmu);
break;
case 0xC090: /* Cortex-A9 */
- pmu = armv7_a9_pmu_init();
+ ret = armv7_a9_pmu_init(pmu);
break;
case 0xC050: /* Cortex-A5 */
- pmu = armv7_a5_pmu_init();
+ ret = armv7_a5_pmu_init(pmu);
break;
case 0xC0F0: /* Cortex-A15 */
- pmu = armv7_a15_pmu_init();
+ ret = armv7_a15_pmu_init(pmu);
break;
case 0xC070: /* Cortex-A7 */
- pmu = armv7_a7_pmu_init();
+ ret = armv7_a7_pmu_init(pmu);
break;
}
/* Intel CPUs [xscale]. */
@@ -237,43 +273,74 @@ static struct arm_pmu *__devinit probe_current_pmu(void)
part_number = (cpuid >> 13) & 0x7;
switch (part_number) {
case 1:
- pmu = xscale1pmu_init();
+ ret = xscale1pmu_init(pmu);
break;
case 2:
- pmu = xscale2pmu_init();
+ ret = xscale2pmu_init(pmu);
break;
}
}
+ /* assume PMU support all the CPUs in this case */
+ cpumask_setall(&pmu->valid_cpus);
+
put_cpu();
- return pmu;
+ return ret;
}
static int __devinit cpu_pmu_device_probe(struct platform_device *pdev)
{
const struct of_device_id *of_id;
- struct arm_pmu *(*init_fn)(void);
struct device_node *node = pdev->dev.of_node;
+ struct arm_pmu *pmu;
+ int ret = 0;
+ int cpu;
- if (cpu_pmu) {
- pr_info("attempt to register multiple PMU devices!");
- return -ENOSPC;
+ pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
+ if (!pmu) {
+ pr_info("failed to allocate PMU device!");
+ return -ENOMEM;
}
if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) {
- init_fn = of_id->data;
- cpu_pmu = init_fn();
+ smp_call_func_t init_fn = (smp_call_func_t)of_id->data;
+ struct device_node *ncluster;
+ int cluster = -1;
+ cpumask_t sibling_mask;
+
+ ncluster = of_parse_phandle(node, "cluster", 0);
+ if (ncluster) {
+ int len;
+ const u32 *hwid;
+ hwid = of_get_property(ncluster, "reg", &len);
+ if (hwid && len == 4)
+ cluster = be32_to_cpup(hwid);
+ }
+ /* set sibling mask to all cpu mask if socket is not specified */
+ if (cluster == -1 ||
+ cluster_to_logical_mask(cluster, &sibling_mask))
+ cpumask_setall(&sibling_mask);
+
+ smp_call_function_any(&sibling_mask, init_fn, pmu, 1);
+
+ /* now set the valid_cpus after init */
+ cpumask_copy(&pmu->valid_cpus, &sibling_mask);
} else {
- cpu_pmu = probe_current_pmu();
+ ret = probe_current_pmu(pmu);
}
- if (!cpu_pmu)
- return -ENODEV;
+ if (ret) {
+ pr_info("failed to register PMU devices!");
+ kfree(pmu);
+ return ret;
+ }
- cpu_pmu->plat_device = pdev;
- cpu_pmu_init(cpu_pmu);
- register_cpu_notifier(&cpu_pmu_hotplug_notifier);
- armpmu_register(cpu_pmu, cpu_pmu->name, PERF_TYPE_RAW);
+ for_each_cpu_mask(cpu, pmu->valid_cpus)
+ per_cpu(cpu_pmu, cpu) = pmu;
+
+ pmu->plat_device = pdev;
+ cpu_pmu_init(pmu);
+ armpmu_register(pmu, -1);
return 0;
}
@@ -290,6 +357,24 @@ static struct platform_driver cpu_pmu_driver = {
static int __init register_pmu_driver(void)
{
- return platform_driver_register(&cpu_pmu_driver);
+ int err;
+
+ err = register_cpu_notifier(&cpu_pmu_hotplug_notifier);
+ if (err)
+ return err;
+
+ err = cpu_pm_register_notifier(&cpu_pmu_pm_notifier);
+ if (err) {
+ unregister_cpu_notifier(&cpu_pmu_hotplug_notifier);
+ return err;
+ }
+
+ err = platform_driver_register(&cpu_pmu_driver);
+ if (err) {
+ cpu_pm_unregister_notifier(&cpu_pmu_pm_notifier);
+ unregister_cpu_notifier(&cpu_pmu_hotplug_notifier);
+ }
+
+ return err;
}
device_initcall(register_pmu_driver);
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index 6ccc0797174..b4b0c084511 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -401,9 +401,10 @@ armv6_pmcr_counter_has_overflowed(unsigned long pmcr,
return ret;
}
-static inline u32
-armv6pmu_read_counter(int counter)
+static inline u32 armv6pmu_read_counter(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
unsigned long value = 0;
if (ARMV6_CYCLE_COUNTER == counter)
@@ -418,10 +419,11 @@ armv6pmu_read_counter(int counter)
return value;
}
-static inline void
-armv6pmu_write_counter(int counter,
- u32 value)
+static inline void armv6pmu_write_counter(struct perf_event *event, u32 value)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
+
if (ARMV6_CYCLE_COUNTER == counter)
asm volatile("mcr p15, 0, %0, c15, c12, 1" : : "r"(value));
else if (ARMV6_COUNTER0 == counter)
@@ -432,12 +434,13 @@ armv6pmu_write_counter(int counter,
WARN_ONCE(1, "invalid counter number (%d)\n", counter);
}
-static void
-armv6pmu_enable_event(struct hw_perf_event *hwc,
- int idx)
+static void armv6pmu_enable_event(struct perf_event *event)
{
unsigned long val, mask, evt, flags;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
if (ARMV6_CYCLE_COUNTER == idx) {
mask = 0;
@@ -473,7 +476,8 @@ armv6pmu_handle_irq(int irq_num,
{
unsigned long pmcr = armv6_pmcr_read();
struct perf_sample_data data;
- struct pmu_hw_events *cpuc;
+ struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+ struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
struct pt_regs *regs;
int idx;
@@ -489,7 +493,6 @@ armv6pmu_handle_irq(int irq_num,
*/
armv6_pmcr_write(pmcr);
- cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -506,13 +509,13 @@ armv6pmu_handle_irq(int irq_num,
continue;
hwc = &event->hw;
- armpmu_event_update(event, hwc, idx);
+ armpmu_event_update(event);
perf_sample_data_init(&data, 0, hwc->last_period);
- if (!armpmu_event_set_period(event, hwc, idx))
+ if (!armpmu_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
- cpu_pmu->disable(hwc, idx);
+ cpu_pmu->disable(event);
}
/*
@@ -527,8 +530,7 @@ armv6pmu_handle_irq(int irq_num,
return IRQ_HANDLED;
}
-static void
-armv6pmu_start(void)
+static void armv6pmu_start(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -540,8 +542,7 @@ armv6pmu_start(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-armv6pmu_stop(void)
+static void armv6pmu_stop(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -555,10 +556,11 @@ armv6pmu_stop(void)
static int
armv6pmu_get_event_idx(struct pmu_hw_events *cpuc,
- struct hw_perf_event *event)
+ struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
/* Always place a cycle counter into the cycle counter. */
- if (ARMV6_PERFCTR_CPU_CYCLES == event->config_base) {
+ if (ARMV6_PERFCTR_CPU_CYCLES == hwc->config_base) {
if (test_and_set_bit(ARMV6_CYCLE_COUNTER, cpuc->used_mask))
return -EAGAIN;
@@ -579,12 +581,13 @@ armv6pmu_get_event_idx(struct pmu_hw_events *cpuc,
}
}
-static void
-armv6pmu_disable_event(struct hw_perf_event *hwc,
- int idx)
+static void armv6pmu_disable_event(struct perf_event *event)
{
unsigned long val, mask, evt, flags;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
if (ARMV6_CYCLE_COUNTER == idx) {
mask = ARMV6_PMCR_CCOUNT_IEN;
@@ -613,12 +616,13 @@ armv6pmu_disable_event(struct hw_perf_event *hwc,
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-armv6mpcore_pmu_disable_event(struct hw_perf_event *hwc,
- int idx)
+static void armv6mpcore_pmu_disable_event(struct perf_event *event)
{
unsigned long val, mask, flags, evt = 0;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
if (ARMV6_CYCLE_COUNTER == idx) {
mask = ARMV6_PMCR_CCOUNT_IEN;
@@ -649,24 +653,24 @@ static int armv6_map_event(struct perf_event *event)
&armv6_perf_cache_map, 0xFF);
}
-static struct arm_pmu armv6pmu = {
- .name = "v6",
- .handle_irq = armv6pmu_handle_irq,
- .enable = armv6pmu_enable_event,
- .disable = armv6pmu_disable_event,
- .read_counter = armv6pmu_read_counter,
- .write_counter = armv6pmu_write_counter,
- .get_event_idx = armv6pmu_get_event_idx,
- .start = armv6pmu_start,
- .stop = armv6pmu_stop,
- .map_event = armv6_map_event,
- .num_events = 3,
- .max_period = (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit armv6pmu_init(void)
+static int __devinit armv6pmu_init(struct arm_pmu *cpu_pmu)
{
- return &armv6pmu;
+ *cpu_pmu = (struct arm_pmu) {
+ .name = "v6",
+ .handle_irq = armv6pmu_handle_irq,
+ .enable = armv6pmu_enable_event,
+ .disable = armv6pmu_disable_event,
+ .read_counter = armv6pmu_read_counter,
+ .write_counter = armv6pmu_write_counter,
+ .get_event_idx = armv6pmu_get_event_idx,
+ .start = armv6pmu_start,
+ .stop = armv6pmu_stop,
+ .map_event = armv6_map_event,
+ .num_events = 3,
+ .max_period = (1LLU << 32) - 1,
+ };
+
+ return 0;
}
/*
@@ -683,33 +687,33 @@ static int armv6mpcore_map_event(struct perf_event *event)
&armv6mpcore_perf_cache_map, 0xFF);
}
-static struct arm_pmu armv6mpcore_pmu = {
- .name = "v6mpcore",
- .handle_irq = armv6pmu_handle_irq,
- .enable = armv6pmu_enable_event,
- .disable = armv6mpcore_pmu_disable_event,
- .read_counter = armv6pmu_read_counter,
- .write_counter = armv6pmu_write_counter,
- .get_event_idx = armv6pmu_get_event_idx,
- .start = armv6pmu_start,
- .stop = armv6pmu_stop,
- .map_event = armv6mpcore_map_event,
- .num_events = 3,
- .max_period = (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit armv6mpcore_pmu_init(void)
+static int __devinit armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
{
- return &armv6mpcore_pmu;
+ *cpu_pmu = (struct arm_pmu) {
+ .name = "v6mpcore",
+ .handle_irq = armv6pmu_handle_irq,
+ .enable = armv6pmu_enable_event,
+ .disable = armv6mpcore_pmu_disable_event,
+ .read_counter = armv6pmu_read_counter,
+ .write_counter = armv6pmu_write_counter,
+ .get_event_idx = armv6pmu_get_event_idx,
+ .start = armv6pmu_start,
+ .stop = armv6pmu_stop,
+ .map_event = armv6mpcore_map_event,
+ .num_events = 3,
+ .max_period = (1LLU << 32) - 1,
+ };
+
+ return 0;
}
#else
-static struct arm_pmu *__devinit armv6pmu_init(void)
+static int armv6pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit armv6mpcore_pmu_init(void)
+static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
#endif /* CONFIG_CPU_V6 || CONFIG_CPU_V6K */
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index bd4b090ebcf..3565d8084d6 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -18,8 +18,6 @@
#ifdef CONFIG_CPU_V7
-static struct arm_pmu armv7pmu;
-
/*
* Common ARMv7 event types
*
@@ -738,7 +736,8 @@ static const unsigned armv7_a7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
*/
#define ARMV7_IDX_CYCLE_COUNTER 0
#define ARMV7_IDX_COUNTER0 1
-#define ARMV7_IDX_COUNTER_LAST (ARMV7_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1)
+#define ARMV7_IDX_COUNTER_LAST(cpu_pmu) \
+ (ARMV7_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1)
#define ARMV7_MAX_COUNTERS 32
#define ARMV7_COUNTER_MASK (ARMV7_MAX_COUNTERS - 1)
@@ -804,49 +803,34 @@ static inline int armv7_pmnc_has_overflowed(u32 pmnc)
return pmnc & ARMV7_OVERFLOWED_MASK;
}
-static inline int armv7_pmnc_counter_valid(int idx)
+static inline int armv7_pmnc_counter_valid(struct arm_pmu *cpu_pmu, int idx)
{
- return idx >= ARMV7_IDX_CYCLE_COUNTER && idx <= ARMV7_IDX_COUNTER_LAST;
+ return idx >= ARMV7_IDX_CYCLE_COUNTER &&
+ idx <= ARMV7_IDX_COUNTER_LAST(cpu_pmu);
}
static inline int armv7_pmnc_counter_has_overflowed(u32 pmnc, int idx)
{
- int ret = 0;
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u checking wrong counter %d overflow status\n",
- smp_processor_id(), idx);
- } else {
- counter = ARMV7_IDX_TO_COUNTER(idx);
- ret = pmnc & BIT(counter);
- }
-
- return ret;
+ return pmnc & BIT(ARMV7_IDX_TO_COUNTER(idx));
}
static inline int armv7_pmnc_select_counter(int idx)
{
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u selecting wrong PMNC counter %d\n",
- smp_processor_id(), idx);
- return -EINVAL;
- }
-
- counter = ARMV7_IDX_TO_COUNTER(idx);
+ u32 counter = ARMV7_IDX_TO_COUNTER(idx);
asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (counter));
isb();
return idx;
}
-static inline u32 armv7pmu_read_counter(int idx)
+static inline u32 armv7pmu_read_counter(struct perf_event *event)
{
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
u32 value = 0;
- if (!armv7_pmnc_counter_valid(idx))
+ if (!armv7_pmnc_counter_valid(cpu_pmu, idx))
pr_err("CPU%u reading wrong counter %d\n",
smp_processor_id(), idx);
else if (idx == ARMV7_IDX_CYCLE_COUNTER)
@@ -857,9 +841,13 @@ static inline u32 armv7pmu_read_counter(int idx)
return value;
}
-static inline void armv7pmu_write_counter(int idx, u32 value)
+static inline void armv7pmu_write_counter(struct perf_event *event, u32 value)
{
- if (!armv7_pmnc_counter_valid(idx))
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ if (!armv7_pmnc_counter_valid(cpu_pmu, idx))
pr_err("CPU%u writing wrong counter %d\n",
smp_processor_id(), idx);
else if (idx == ARMV7_IDX_CYCLE_COUNTER)
@@ -878,60 +866,28 @@ static inline void armv7_pmnc_write_evtsel(int idx, u32 val)
static inline int armv7_pmnc_enable_counter(int idx)
{
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u enabling wrong PMNC counter %d\n",
- smp_processor_id(), idx);
- return -EINVAL;
- }
-
- counter = ARMV7_IDX_TO_COUNTER(idx);
+ u32 counter = ARMV7_IDX_TO_COUNTER(idx);
asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (BIT(counter)));
return idx;
}
static inline int armv7_pmnc_disable_counter(int idx)
{
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u disabling wrong PMNC counter %d\n",
- smp_processor_id(), idx);
- return -EINVAL;
- }
-
- counter = ARMV7_IDX_TO_COUNTER(idx);
+ u32 counter = ARMV7_IDX_TO_COUNTER(idx);
asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (BIT(counter)));
return idx;
}
static inline int armv7_pmnc_enable_intens(int idx)
{
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u enabling wrong PMNC counter IRQ enable %d\n",
- smp_processor_id(), idx);
- return -EINVAL;
- }
-
- counter = ARMV7_IDX_TO_COUNTER(idx);
+ u32 counter = ARMV7_IDX_TO_COUNTER(idx);
asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (BIT(counter)));
return idx;
}
static inline int armv7_pmnc_disable_intens(int idx)
{
- u32 counter;
-
- if (!armv7_pmnc_counter_valid(idx)) {
- pr_err("CPU%u disabling wrong PMNC counter IRQ enable %d\n",
- smp_processor_id(), idx);
- return -EINVAL;
- }
-
- counter = ARMV7_IDX_TO_COUNTER(idx);
+ u32 counter = ARMV7_IDX_TO_COUNTER(idx);
asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (BIT(counter)));
isb();
/* Clear the overflow flag in case an interrupt is pending. */
@@ -956,7 +912,7 @@ static inline u32 armv7_pmnc_getreset_flags(void)
}
#ifdef DEBUG
-static void armv7_pmnc_dump_regs(void)
+static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu)
{
u32 val;
unsigned int cnt;
@@ -981,7 +937,8 @@ static void armv7_pmnc_dump_regs(void)
asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (val));
printk(KERN_INFO "CCNT =0x%08x\n", val);
- for (cnt = ARMV7_IDX_COUNTER0; cnt <= ARMV7_IDX_COUNTER_LAST; cnt++) {
+ for (cnt = ARMV7_IDX_COUNTER0;
+ cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
armv7_pmnc_select_counter(cnt);
asm volatile("mrc p15, 0, %0, c9, c13, 2" : "=r" (val));
printk(KERN_INFO "CNT[%d] count =0x%08x\n",
@@ -993,10 +950,64 @@ static void armv7_pmnc_dump_regs(void)
}
#endif
-static void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void armv7pmu_save_regs(struct arm_pmu *cpu_pmu,
+ struct cpupmu_regs *regs)
+{
+ unsigned int cnt;
+ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (regs->pmc));
+ if (!(regs->pmc & ARMV7_PMNC_E))
+ return;
+
+ asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (regs->pmcntenset));
+ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (regs->pmuseren));
+ asm volatile("mrc p15, 0, %0, c9, c14, 1" : "=r" (regs->pmintenset));
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (regs->pmxevtcnt[0]));
+ for (cnt = ARMV7_IDX_COUNTER0;
+ cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
+ armv7_pmnc_select_counter(cnt);
+ asm volatile("mrc p15, 0, %0, c9, c13, 1"
+ : "=r"(regs->pmxevttype[cnt]));
+ asm volatile("mrc p15, 0, %0, c9, c13, 2"
+ : "=r"(regs->pmxevtcnt[cnt]));
+ }
+ return;
+}
+
+static void armv7pmu_restore_regs(struct arm_pmu *cpu_pmu,
+ struct cpupmu_regs *regs)
+{
+ unsigned int cnt;
+ if (!(regs->pmc & ARMV7_PMNC_E))
+ return;
+
+ asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (regs->pmcntenset));
+ asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r" (regs->pmuseren));
+ asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (regs->pmintenset));
+ asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (regs->pmxevtcnt[0]));
+ for (cnt = ARMV7_IDX_COUNTER0;
+ cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
+ armv7_pmnc_select_counter(cnt);
+ asm volatile("mcr p15, 0, %0, c9, c13, 1"
+ : : "r"(regs->pmxevttype[cnt]));
+ asm volatile("mcr p15, 0, %0, c9, c13, 2"
+ : : "r"(regs->pmxevtcnt[cnt]));
+ }
+ asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (regs->pmc));
+}
+
+static void armv7pmu_enable_event(struct perf_event *event)
{
unsigned long flags;
+ struct hw_perf_event *hwc = &event->hw;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
+
+ if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) {
+ pr_err("CPU%u enabling wrong PMNC counter IRQ enable %d\n",
+ smp_processor_id(), idx);
+ return;
+ }
/*
* Enable counter and interrupt, and set the counter to count
@@ -1014,7 +1025,7 @@ static void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
* We only need to set the event for the cycle counter if we
* have the ability to perform event filtering.
*/
- if (armv7pmu.set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER)
+ if (cpu_pmu->set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER)
armv7_pmnc_write_evtsel(idx, hwc->config_base);
/*
@@ -1030,10 +1041,19 @@ static void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void armv7pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void armv7pmu_disable_event(struct perf_event *event)
{
unsigned long flags;
+ struct hw_perf_event *hwc = &event->hw;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
+
+ if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) {
+ pr_err("CPU%u disabling wrong PMNC counter IRQ enable %d\n",
+ smp_processor_id(), idx);
+ return;
+ }
/*
* Disable counter and interrupt
@@ -1057,7 +1077,8 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
{
u32 pmnc;
struct perf_sample_data data;
- struct pmu_hw_events *cpuc;
+ struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+ struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
struct pt_regs *regs;
int idx;
@@ -1077,7 +1098,6 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
*/
regs = get_irq_regs();
- cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -1094,13 +1114,13 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
continue;
hwc = &event->hw;
- armpmu_event_update(event, hwc, idx);
+ armpmu_event_update(event);
perf_sample_data_init(&data, 0, hwc->last_period);
- if (!armpmu_event_set_period(event, hwc, idx))
+ if (!armpmu_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
- cpu_pmu->disable(hwc, idx);
+ cpu_pmu->disable(event);
}
/*
@@ -1115,7 +1135,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
return IRQ_HANDLED;
}
-static void armv7pmu_start(void)
+static void armv7pmu_start(struct arm_pmu *cpu_pmu)
{
unsigned long flags;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -1126,7 +1146,7 @@ static void armv7pmu_start(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void armv7pmu_stop(void)
+static void armv7pmu_stop(struct arm_pmu *cpu_pmu)
{
unsigned long flags;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -1138,10 +1158,12 @@ static void armv7pmu_stop(void)
}
static int armv7pmu_get_event_idx(struct pmu_hw_events *cpuc,
- struct hw_perf_event *event)
+ struct perf_event *event)
{
int idx;
- unsigned long evtype = event->config_base & ARMV7_EVTYPE_EVENT;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long evtype = hwc->config_base & ARMV7_EVTYPE_EVENT;
/* Always place a cycle counter into the cycle counter. */
if (evtype == ARMV7_PERFCTR_CPU_CYCLES) {
@@ -1192,11 +1214,14 @@ static int armv7pmu_set_event_filter(struct hw_perf_event *event,
static void armv7pmu_reset(void *info)
{
+ struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
u32 idx, nb_cnt = cpu_pmu->num_events;
/* The counter and interrupt enable registers are unknown at reset. */
- for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx)
- armv7pmu_disable_event(NULL, idx);
+ for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
+ armv7_pmnc_disable_counter(idx);
+ armv7_pmnc_disable_intens(idx);
+ }
/* Initialize & Reset PMNC: C and P bits */
armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
@@ -1232,17 +1257,22 @@ static int armv7_a7_map_event(struct perf_event *event)
&armv7_a7_perf_cache_map, 0xFF);
}
-static struct arm_pmu armv7pmu = {
- .handle_irq = armv7pmu_handle_irq,
- .enable = armv7pmu_enable_event,
- .disable = armv7pmu_disable_event,
- .read_counter = armv7pmu_read_counter,
- .write_counter = armv7pmu_write_counter,
- .get_event_idx = armv7pmu_get_event_idx,
- .start = armv7pmu_start,
- .stop = armv7pmu_stop,
- .reset = armv7pmu_reset,
- .max_period = (1LLU << 32) - 1,
+static void armv7pmu_init(struct arm_pmu *cpu_pmu)
+{
+ *cpu_pmu = (struct arm_pmu) {
+ .handle_irq = armv7pmu_handle_irq,
+ .enable = armv7pmu_enable_event,
+ .disable = armv7pmu_disable_event,
+ .read_counter = armv7pmu_read_counter,
+ .write_counter = armv7pmu_write_counter,
+ .get_event_idx = armv7pmu_get_event_idx,
+ .start = armv7pmu_start,
+ .stop = armv7pmu_stop,
+ .reset = armv7pmu_reset,
+ .save_regs = armv7pmu_save_regs,
+ .restore_regs = armv7pmu_restore_regs,
+ .max_period = (1LLU << 32) - 1,
+ };
};
static u32 __devinit armv7_read_num_pmnc_events(void)
@@ -1256,70 +1286,75 @@ static u32 __devinit armv7_read_num_pmnc_events(void)
return nb_cnt + 1;
}
-static struct arm_pmu *__devinit armv7_a8_pmu_init(void)
+static int __devinit armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
{
- armv7pmu.name = "ARMv7 Cortex-A8";
- armv7pmu.map_event = armv7_a8_map_event;
- armv7pmu.num_events = armv7_read_num_pmnc_events();
- return &armv7pmu;
+ armv7pmu_init(cpu_pmu);
+ cpu_pmu->name = "ARMv7_Cortex_A8";
+ cpu_pmu->map_event = armv7_a8_map_event;
+ cpu_pmu->num_events = armv7_read_num_pmnc_events();
+ return 0;
}
-static struct arm_pmu *__devinit armv7_a9_pmu_init(void)
+static int __devinit armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
{
- armv7pmu.name = "ARMv7 Cortex-A9";
- armv7pmu.map_event = armv7_a9_map_event;
- armv7pmu.num_events = armv7_read_num_pmnc_events();
- return &armv7pmu;
+ armv7pmu_init(cpu_pmu);
+ cpu_pmu->name = "ARMv7_Cortex_A9";
+ cpu_pmu->map_event = armv7_a9_map_event;
+ cpu_pmu->num_events = armv7_read_num_pmnc_events();
+ return 0;
}
-static struct arm_pmu *__devinit armv7_a5_pmu_init(void)
+static int __devinit armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
{
- armv7pmu.name = "ARMv7 Cortex-A5";
- armv7pmu.map_event = armv7_a5_map_event;
- armv7pmu.num_events = armv7_read_num_pmnc_events();
- return &armv7pmu;
+ armv7pmu_init(cpu_pmu);
+ cpu_pmu->name = "ARMv7_Cortex_A5";
+ cpu_pmu->map_event = armv7_a5_map_event;
+ cpu_pmu->num_events = armv7_read_num_pmnc_events();
+ return 0;
}
-static struct arm_pmu *__devinit armv7_a15_pmu_init(void)
+static int __devinit armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
{
- armv7pmu.name = "ARMv7 Cortex-A15";
- armv7pmu.map_event = armv7_a15_map_event;
- armv7pmu.num_events = armv7_read_num_pmnc_events();
- armv7pmu.set_event_filter = armv7pmu_set_event_filter;
- return &armv7pmu;
+ armv7pmu_init(cpu_pmu);
+ cpu_pmu->name = "ARMv7_Cortex_A15";
+ cpu_pmu->map_event = armv7_a15_map_event;
+ cpu_pmu->num_events = armv7_read_num_pmnc_events();
+ cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
+ return 0;
}
-static struct arm_pmu *__devinit armv7_a7_pmu_init(void)
+static int __devinit armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
{
- armv7pmu.name = "ARMv7 Cortex-A7";
- armv7pmu.map_event = armv7_a7_map_event;
- armv7pmu.num_events = armv7_read_num_pmnc_events();
- armv7pmu.set_event_filter = armv7pmu_set_event_filter;
- return &armv7pmu;
+ armv7pmu_init(cpu_pmu);
+ cpu_pmu->name = "ARMv7_Cortex_A7";
+ cpu_pmu->map_event = armv7_a7_map_event;
+ cpu_pmu->num_events = armv7_read_num_pmnc_events();
+ cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
+ return 0;
}
#else
-static struct arm_pmu *__devinit armv7_a8_pmu_init(void)
+static inline int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit armv7_a9_pmu_init(void)
+static inline int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit armv7_a5_pmu_init(void)
+static inline int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit armv7_a15_pmu_init(void)
+static inline int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit armv7_a7_pmu_init(void)
+static inline int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
#endif /* CONFIG_CPU_V7 */
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 426e19f380a..1d3e1bf4865 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -224,7 +224,8 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
{
unsigned long pmnc;
struct perf_sample_data data;
- struct pmu_hw_events *cpuc;
+ struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+ struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
struct pt_regs *regs;
int idx;
@@ -248,7 +249,6 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
regs = get_irq_regs();
- cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -260,13 +260,13 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
continue;
hwc = &event->hw;
- armpmu_event_update(event, hwc, idx);
+ armpmu_event_update(event);
perf_sample_data_init(&data, 0, hwc->last_period);
- if (!armpmu_event_set_period(event, hwc, idx))
+ if (!armpmu_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
- cpu_pmu->disable(hwc, idx);
+ cpu_pmu->disable(event);
}
irq_work_run();
@@ -280,11 +280,13 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
return IRQ_HANDLED;
}
-static void
-xscale1pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void xscale1pmu_enable_event(struct perf_event *event)
{
unsigned long val, mask, evt, flags;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
switch (idx) {
case XSCALE_CYCLE_COUNTER:
@@ -314,11 +316,13 @@ xscale1pmu_enable_event(struct hw_perf_event *hwc, int idx)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-xscale1pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void xscale1pmu_disable_event(struct perf_event *event)
{
unsigned long val, mask, evt, flags;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
switch (idx) {
case XSCALE_CYCLE_COUNTER:
@@ -348,9 +352,10 @@ xscale1pmu_disable_event(struct hw_perf_event *hwc, int idx)
static int
xscale1pmu_get_event_idx(struct pmu_hw_events *cpuc,
- struct hw_perf_event *event)
+ struct perf_event *event)
{
- if (XSCALE_PERFCTR_CCNT == event->config_base) {
+ struct hw_perf_event *hwc = &event->hw;
+ if (XSCALE_PERFCTR_CCNT == hwc->config_base) {
if (test_and_set_bit(XSCALE_CYCLE_COUNTER, cpuc->used_mask))
return -EAGAIN;
@@ -366,8 +371,7 @@ xscale1pmu_get_event_idx(struct pmu_hw_events *cpuc,
}
}
-static void
-xscale1pmu_start(void)
+static void xscale1pmu_start(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -379,8 +383,7 @@ xscale1pmu_start(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-xscale1pmu_stop(void)
+static void xscale1pmu_stop(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -392,9 +395,10 @@ xscale1pmu_stop(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static inline u32
-xscale1pmu_read_counter(int counter)
+static inline u32 xscale1pmu_read_counter(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
u32 val = 0;
switch (counter) {
@@ -412,9 +416,11 @@ xscale1pmu_read_counter(int counter)
return val;
}
-static inline void
-xscale1pmu_write_counter(int counter, u32 val)
+static inline void xscale1pmu_write_counter(struct perf_event *event, u32 val)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
+
switch (counter) {
case XSCALE_CYCLE_COUNTER:
asm volatile("mcr p14, 0, %0, c1, c0, 0" : : "r" (val));
@@ -434,24 +440,24 @@ static int xscale_map_event(struct perf_event *event)
&xscale_perf_cache_map, 0xFF);
}
-static struct arm_pmu xscale1pmu = {
- .name = "xscale1",
- .handle_irq = xscale1pmu_handle_irq,
- .enable = xscale1pmu_enable_event,
- .disable = xscale1pmu_disable_event,
- .read_counter = xscale1pmu_read_counter,
- .write_counter = xscale1pmu_write_counter,
- .get_event_idx = xscale1pmu_get_event_idx,
- .start = xscale1pmu_start,
- .stop = xscale1pmu_stop,
- .map_event = xscale_map_event,
- .num_events = 3,
- .max_period = (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit xscale1pmu_init(void)
+static int __devinit xscale1pmu_init(struct arm_pmu *cpu_pmu)
{
- return &xscale1pmu;
+ *cpu_pmu = (struct arm_pmu) {
+ .name = "xscale1",
+ .handle_irq = xscale1pmu_handle_irq,
+ .enable = xscale1pmu_enable_event,
+ .disable = xscale1pmu_disable_event,
+ .read_counter = xscale1pmu_read_counter,
+ .write_counter = xscale1pmu_write_counter,
+ .get_event_idx = xscale1pmu_get_event_idx,
+ .start = xscale1pmu_start,
+ .stop = xscale1pmu_stop,
+ .map_event = xscale_map_event,
+ .num_events = 3,
+ .max_period = (1LLU << 32) - 1,
+ };
+
+ return 0;
}
#define XSCALE2_OVERFLOWED_MASK 0x01f
@@ -567,7 +573,8 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
{
unsigned long pmnc, of_flags;
struct perf_sample_data data;
- struct pmu_hw_events *cpuc;
+ struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+ struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
struct pt_regs *regs;
int idx;
@@ -585,7 +592,6 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
regs = get_irq_regs();
- cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
struct perf_event *event = cpuc->events[idx];
struct hw_perf_event *hwc;
@@ -597,13 +603,13 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
continue;
hwc = &event->hw;
- armpmu_event_update(event, hwc, idx);
+ armpmu_event_update(event);
perf_sample_data_init(&data, 0, hwc->last_period);
- if (!armpmu_event_set_period(event, hwc, idx))
+ if (!armpmu_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
- cpu_pmu->disable(hwc, idx);
+ cpu_pmu->disable(event);
}
irq_work_run();
@@ -617,11 +623,13 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
return IRQ_HANDLED;
}
-static void
-xscale2pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void xscale2pmu_enable_event(struct perf_event *event)
{
unsigned long flags, ien, evtsel;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
ien = xscale2pmu_read_int_enable();
evtsel = xscale2pmu_read_event_select();
@@ -661,11 +669,13 @@ xscale2pmu_enable_event(struct hw_perf_event *hwc, int idx)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-xscale2pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void xscale2pmu_disable_event(struct perf_event *event)
{
unsigned long flags, ien, evtsel, of_flags;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+ int idx = hwc->idx;
ien = xscale2pmu_read_int_enable();
evtsel = xscale2pmu_read_event_select();
@@ -713,7 +723,7 @@ xscale2pmu_disable_event(struct hw_perf_event *hwc, int idx)
static int
xscale2pmu_get_event_idx(struct pmu_hw_events *cpuc,
- struct hw_perf_event *event)
+ struct perf_event *event)
{
int idx = xscale1pmu_get_event_idx(cpuc, event);
if (idx >= 0)
@@ -727,8 +737,7 @@ out:
return idx;
}
-static void
-xscale2pmu_start(void)
+static void xscale2pmu_start(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -740,8 +749,7 @@ xscale2pmu_start(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static void
-xscale2pmu_stop(void)
+static void xscale2pmu_stop(struct arm_pmu *cpu_pmu)
{
unsigned long flags, val;
struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -753,9 +761,10 @@ xscale2pmu_stop(void)
raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
-static inline u32
-xscale2pmu_read_counter(int counter)
+static inline u32 xscale2pmu_read_counter(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
u32 val = 0;
switch (counter) {
@@ -779,9 +788,11 @@ xscale2pmu_read_counter(int counter)
return val;
}
-static inline void
-xscale2pmu_write_counter(int counter, u32 val)
+static inline void xscale2pmu_write_counter(struct perf_event *event, u32 val)
{
+ struct hw_perf_event *hwc = &event->hw;
+ int counter = hwc->idx;
+
switch (counter) {
case XSCALE_CYCLE_COUNTER:
asm volatile("mcr p14, 0, %0, c1, c1, 0" : : "r" (val));
@@ -801,33 +812,33 @@ xscale2pmu_write_counter(int counter, u32 val)
}
}
-static struct arm_pmu xscale2pmu = {
- .name = "xscale2",
- .handle_irq = xscale2pmu_handle_irq,
- .enable = xscale2pmu_enable_event,
- .disable = xscale2pmu_disable_event,
- .read_counter = xscale2pmu_read_counter,
- .write_counter = xscale2pmu_write_counter,
- .get_event_idx = xscale2pmu_get_event_idx,
- .start = xscale2pmu_start,
- .stop = xscale2pmu_stop,
- .map_event = xscale_map_event,
- .num_events = 5,
- .max_period = (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit xscale2pmu_init(void)
+static int __devinit xscale2pmu_init(struct arm_pmu *cpu_pmu)
{
- return &xscale2pmu;
+ *cpu_pmu = (struct arm_pmu) {
+ .name = "xscale2",
+ .handle_irq = xscale2pmu_handle_irq,
+ .enable = xscale2pmu_enable_event,
+ .disable = xscale2pmu_disable_event,
+ .read_counter = xscale2pmu_read_counter,
+ .write_counter = xscale2pmu_write_counter,
+ .get_event_idx = xscale2pmu_get_event_idx,
+ .start = xscale2pmu_start,
+ .stop = xscale2pmu_stop,
+ .map_event = xscale_map_event,
+ .num_events = 5,
+ .max_period = (1LLU << 32) - 1,
+ };
+
+ return 0;
}
#else
-static struct arm_pmu *__devinit xscale1pmu_init(void)
+static inline int xscale1pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
-static struct arm_pmu *__devinit xscale2pmu_init(void)
+static inline int xscale2pmu_init(struct arm_pmu *cpu_pmu)
{
- return NULL;
+ return -ENODEV;
}
#endif /* CONFIG_CPU_XSCALE */
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 26c12c6440f..4d34e0e7e94 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -226,6 +226,11 @@ static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
*/
struct cputopo_arm cpu_topology[NR_CPUS];
+int arch_sd_share_power_line(void)
+{
+ return 1*SD_SHARE_POWERLINE;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
@@ -317,6 +322,126 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+
+#ifdef CONFIG_SCHED_HMP
+
+static const char * const little_cores[] = {
+ "arm,cortex-a7",
+ NULL,
+};
+
+static bool is_little_cpu(struct device_node *cn)
+{
+ const char * const *lc;
+ for (lc = little_cores; *lc; lc++)
+ if (of_device_is_compatible(cn, *lc))
+ return true;
+ return false;
+}
+
+void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+ struct cpumask *slow)
+{
+ struct device_node *cn = NULL;
+ int cpu = 0;
+
+ cpumask_clear(fast);
+ cpumask_clear(slow);
+
+ /*
+ * Use the config options if they are given. This helps testing
+ * HMP scheduling on systems without a big.LITTLE architecture.
+ */
+ if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+ if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast))
+ WARN(1, "Failed to parse HMP fast cpu mask!\n");
+ if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow))
+ WARN(1, "Failed to parse HMP slow cpu mask!\n");
+ return;
+ }
+
+ /*
+ * Else, parse device tree for little cores.
+ */
+ while ((cn = of_find_node_by_type(cn, "cpu"))) {
+
+ if (cpu >= num_possible_cpus())
+ break;
+
+ if (is_little_cpu(cn))
+ cpumask_set_cpu(cpu, slow);
+ else
+ cpumask_set_cpu(cpu, fast);
+
+ cpu++;
+ }
+
+ if (!cpumask_empty(fast) && !cpumask_empty(slow))
+ return;
+
+ /*
+ * We didn't find both big and little cores so let's call all cores
+ * fast as this will keep the system running, with all cores being
+ * treated equal.
+ */
+ cpumask_setall(fast);
+ cpumask_clear(slow);
+}
+
+void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
+{
+ struct cpumask hmp_fast_cpu_mask;
+ struct cpumask hmp_slow_cpu_mask;
+ struct hmp_domain *domain;
+
+ arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask);
+
+ /*
+ * Initialize hmp_domains
+ * Must be ordered with respect to compute capacity.
+ * Fastest domain at head of list.
+ */
+ if(!cpumask_empty(&hmp_slow_cpu_mask)) {
+ domain = (struct hmp_domain *)
+ kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
+ cpumask_copy(&domain->cpus, &hmp_slow_cpu_mask);
+ list_add(&domain->hmp_domains, hmp_domains_list);
+ }
+ domain = (struct hmp_domain *)
+ kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
+ cpumask_copy(&domain->cpus, &hmp_fast_cpu_mask);
+ list_add(&domain->hmp_domains, hmp_domains_list);
+}
+#endif /* CONFIG_SCHED_HMP */
+
+
+/*
+ * cluster_to_logical_mask - return cpu logical mask of CPUs in a cluster
+ * @socket_id: cluster HW identifier
+ * @cluster_mask: the cpumask location to be initialized, modified by the
+ * function only if return value == 0
+ *
+ * Return:
+ *
+ * 0 on success
+ * -EINVAL if cluster_mask is NULL or there is no record matching socket_id
+ */
+int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask)
+{
+ int cpu;
+
+ if (!cluster_mask)
+ return -EINVAL;
+
+ for_each_online_cpu(cpu)
+ if (socket_id == topology_physical_package_id(cpu)) {
+ cpumask_copy(cluster_mask, topology_core_cpumask(cpu));
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
/*
* init_cpu_topology is called at boot when only one cpu is running
* which prevent simultaneous write access to cpu_topology array
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b7..065c7209854 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -65,6 +65,7 @@ void build_cpu_to_node_map(void);
| SD_BALANCE_EXEC \
| SD_BALANCE_FORK \
| SD_WAKE_AFFINE, \
+ | arch_sd_share_power_line() \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d5e86c9f74f..7e9bdfac6f6 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -71,6 +71,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
| 0*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..01eea702e35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -818,6 +818,7 @@ enum cpu_idle_type {
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_SHARE_POWERLINE 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
@@ -994,6 +995,12 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
+#ifdef CONFIG_SCHED_HMP
+struct hmp_domain {
+ struct cpumask cpus;
+ struct list_head hmp_domains;
+};
+#endif /* CONFIG_SCHED_HMP */
#else /* CONFIG_SMP */
struct sched_domain_attr;
@@ -1061,6 +1068,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1095,6 +1103,24 @@ struct load_weight {
unsigned long weight, inv_weight;
};
+struct sched_avg {
+ /*
+ * These sums represent an infinite geometric series and so are bound
+ * above by 1024/(1-y). Thus we only need a u32 to store them for for all
+ * choices of y < 1-2^(-32)*1024.
+ */
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+ s64 decay_count;
+ unsigned long load_avg_contrib;
+ unsigned long load_avg_ratio;
+#ifdef CONFIG_SCHED_HMP
+ u64 hmp_last_up_migration;
+ u64 hmp_last_down_migration;
+#endif
+ u32 usage_avg_sum;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1155,6 +1181,9 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+#ifdef CONFIG_SMP
+ struct sched_avg avg;
+#endif
};
struct sched_rt_entity {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d3cf0d6e771..8e958b2d938 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 1*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
| 0*SD_PREFER_SIBLING \
| arch_sd_sibling_asym_packing() \
@@ -131,6 +132,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
, \
.last_balance = jiffies, \
@@ -161,6 +163,7 @@ int arch_update_cpu_topology(void);
| 1*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
+ | arch_sd_share_power_line() \
| 0*SD_SERIALIZE \
| 1*SD_PREFER_SIBLING \
, \
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 5a8671e8a67..501aa32eb2f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,159 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+ TP_ARGS(tsk, load_contrib),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, load_contrib)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->load_contrib = load_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d load_contrib=%lu",
+ __entry->comm, __entry->pid,
+ __entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+ TP_PROTO(int cpu, unsigned long ratio),
+
+ TP_ARGS(cpu, ratio),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("cpu=%d ratio=%lu",
+ __entry->cpu,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+ TP_PROTO(int cpu, u64 load),
+
+ TP_ARGS(cpu, load),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(u64, load)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ ),
+
+ TP_printk("cpu=%d load=%llu",
+ __entry->cpu,
+ __entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+ TP_PROTO(struct task_struct *tsk, int dest, int force),
+
+ TP_ARGS(tsk, dest, force),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, dest)
+ __field(int, force)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->dest = dest;
+ __entry->force = force;
+ ),
+
+ TP_printk("comm=%s pid=%d dest=%d force=%d",
+ __entry->comm, __entry->pid,
+ __entry->dest, __entry->force)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..e34e55868f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1524,6 +1526,14 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_SMP
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#ifdef CONFIG_SCHED_HMP
+ p->se.avg.hmp_last_up_migration = 0;
+ p->se.avg.hmp_last_down_migration = 0;
+#endif
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -5537,6 +5547,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
+ update_packing_domain(cpu);
update_top_cache_domain(cpu);
}
@@ -5813,6 +5824,11 @@ int __weak arch_sd_sibling_asym_packing(void)
return 0*SD_ASYM_PACKING;
}
+int __weak arch_sd_share_power_line(void)
+{
+ return 1*SD_SHARE_POWERLINE;
+}
+
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea..b9d54d0d7bb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +91,13 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -206,14 +219,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic64_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b99..8046acce74d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,9 +28,20 @@
#include <linux/interrupt.h>
#include <trace/events/sched.h>
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -157,6 +168,76 @@ void sched_init_granularity(void)
update_sysctl();
}
+
+/*
+ * Save the id of the optimal CPU that should be used to pack small tasks
+ * The value -1 is used when no buddy has been found
+ */
+DEFINE_PER_CPU(int, sd_pack_buddy);
+
+/* Look for the best buddy CPU that can be used to pack small tasks
+ * We make the assumption that it doesn't wort to pack on CPU that share the
+ * same powerline. We looks for the 1st sched_domain without the
+ * SD_SHARE_POWERLINE flag. Then We look for the sched_group witht the lowest
+ * power per core based on the assumption that their power efficiency is
+ * better */
+void update_packing_domain(int cpu)
+{
+ struct sched_domain *sd;
+ int id = -1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE);
+ if (!sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ else
+ if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent)
+ sd = sd->parent;
+
+ while (sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group *pack = sg;
+ struct sched_group *tmp = sg->next;
+
+ /* 1st CPU of the sched domain is a good candidate */
+ if (id == -1)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /* Find sched group of candidate */
+ tmp = sd->groups;
+ do {
+ if (cpumask_test_cpu(id, sched_group_cpus(tmp))) {
+ sg = tmp;
+ break;
+ }
+ } while (tmp = tmp->next, tmp != sd->groups);
+
+ pack = sg;
+ tmp = sg->next;
+
+ /* loop the sched groups to find the best one */
+ while (tmp != sg) {
+ if (tmp->sgp->power * sg->group_weight <
+ sg->sgp->power * tmp->group_weight)
+ pack = tmp;
+ tmp = tmp->next;
+ }
+
+ /* we have found a better group */
+ if (pack != sg)
+ id = cpumask_first(sched_group_cpus(pack));
+
+ /* Look for another CPU than itself */
+ if ((id != cpu)
+ || ((sd->parent) && !(sd->parent->flags && SD_LOAD_BALANCE)))
+ break;
+
+ sd = sd->parent;
+ }
+
+ pr_info("CPU%d packing on CPU%d\n", cpu, id);
+ per_cpu(sd_pack_buddy, cpu) = id;
+}
+
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
@@ -259,6 +340,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -278,6 +362,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -653,9 +739,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -675,10 +758,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +880,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
-
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
-
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
- }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
-
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
- return;
-
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
-
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
- }
-
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
- }
-
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
-
- while (cfs_rq->load_period > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
- }
-
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
-}
-
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -876,8 +890,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic64_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -901,27 +915,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -939,6 +937,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -958,18 +958,653 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value;
+ int (*to_sysfs)(int);
+ int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled;
+#endif
+ int multiplier; /* used to scale the time delta */
+ struct attribute_group attr_group;
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale;
+ u32 min;
+ u32 max;
+ u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable,
+ int running,
+ int cpu)
+{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ u64 scaled_delta;
+ u32 scaled_runnable_contrib;
+ int scaled_delta_w;
+ u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
+ delta = now - sa->last_runnable_update;
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ delta = hmp_variable_scale_convert(delta);
+#endif
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta_w = (delta_w * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta_w;
+ if (running)
+ sa->usage_avg_sum += scaled_delta_w;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+ /* decay the load we have accumulated so far */
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+ /* add the contribution from this period */
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ /* Apply load scaling if necessary.
+ * Note that multiplying the whole series is same as
+ * multiplying all terms
+ */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_runnable_contrib = (runnable_contrib * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += scaled_runnable_contrib;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta;
+ if (running)
+ sa->usage_avg_sum += scaled_delta;
+#else
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+ sa->runnable_avg_period += delta;
+
+ return decayed;
+}
+
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count = 0;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ s64 tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic64_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib, usage_contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div64_u64(contrib,
+ atomic64_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = cfs_rq->rq->cpu;
+#endif
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se, cpu))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic64_read(&cfs_rq->removed_load)) {
+ u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ update_cfs_shares(cfs_rq);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ u32 contrib;
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = rq->cpu;
+#endif
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable, cpu);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (rq->avg.runnable_avg_period + 1);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+#endif
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -1096,9 +1731,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1190,9 +1824,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
- se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1206,7 +1839,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ se->on_rq = 0;
}
/*
@@ -1261,6 +1894,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -1340,6 +1974,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1353,9 +1989,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1448,6 +2085,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1592,14 +2238,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq->clock_task -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1611,9 +2252,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq->clock_task;
cfs_rq->throttle_count++;
return 0;
@@ -1628,7 +2269,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1652,7 +2293,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2311,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -2073,8 +2713,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2852,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
+ }
hrtick_update(rq);
}
@@ -2266,12 +2913,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
- if (!se)
+ if (!se) {
dec_nr_running(rq);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2681,6 +3330,362 @@ done:
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogenous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed ordered by compute capacity with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int dc, cpu;
+
+ pr_debug("Initializing HMP scheduler:\n");
+
+ /* Initialize hmp_domains using platform code */
+ arch_get_hmp_domains(&hmp_domains);
+ if (list_empty(&hmp_domains)) {
+ pr_debug("HMP domain list is empty!\n");
+ return 0;
+ }
+
+ /* Print hmp_domains */
+ dc = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ cpulist_scnprintf(buf, 64, &domain->cpus);
+ pr_debug(" HMP domain %d: %s\n", dc, buf);
+
+ for_each_cpu_mask(cpu, domain->cpus) {
+ per_cpu(hmp_cpu_domain, cpu) = domain;
+ }
+ dc++;
+ }
+
+ return 1;
+}
+
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ * The default values (512, 256) offer good responsiveness, but may need
+ * tweaking suit particular needs.
+ *
+ * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
+#endif
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_is_last(pos, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects a cpu in previous (faster) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(tsk));
+}
+
+/*
+ * Selects a cpu in next (slower) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(tsk));
+}
+
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+
+ se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq);
+ se->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+
+ se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq);
+ se->avg.hmp_last_up_migration = 0;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+/*
+ * Heterogenous multiprocessor (HMP) optimizations
+ *
+ * These functions allow to change the growing speed of the load_avg_ratio
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow to change the up and down threshold of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
+ * task with a load of 0 will reach the threshold after 64ms of busy loop.
+ *
+ * Changing load_avg_periods_ms has the same effect than changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could be overflowed for a weight > 2^12 even is the load_avg_contrib
+ * should still be a 32bits result. This would not happen by multiplicating
+ * delta time by 1/22 and setting load_avg_period_ms = 706.
+ */
+
+/*
+ * By scaling the delta time it end-up increasing or decrease the
+ * growing speed of the per entity load_avg_ratio
+ * The scale factor hmp_data.multiplier is a fixed point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static u64 hmp_variable_scale_convert(u64 delta)
+{
+ u64 high = delta >> 32ULL;
+ u64 low = delta & 0xffffffffULL;
+ low *= hmp_data.multiplier;
+ high *= hmp_data.multiplier;
+ return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+}
+
+static ssize_t hmp_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ ssize_t ret = 0;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ int temp = *(hmp_attr->value);
+ if (hmp_attr->to_sysfs != NULL)
+ temp = hmp_attr->to_sysfs(temp);
+ ret = sprintf(buf, "%d\n", temp);
+ return ret;
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int temp;
+ ssize_t ret = count;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ char *str = vmalloc(count + 1);
+ if (str == NULL)
+ return -ENOMEM;
+ memcpy(str, buf, count);
+ str[count] = 0;
+ if (sscanf(str, "%d", &temp) < 1)
+ ret = -EINVAL;
+ else {
+ if (hmp_attr->from_sysfs != NULL)
+ temp = hmp_attr->from_sysfs(temp);
+ if (temp < 0)
+ ret = -EINVAL;
+ else
+ *(hmp_attr->value) = temp;
+ }
+ vfree(str);
+ return ret;
+}
+
+static int hmp_period_tofrom_sysfs(int value)
+{
+ return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+
+/* max value for threshold is 1024 */
+static int hmp_theshold_from_sysfs(int value)
+{
+ if (value > 1024)
+ return -1;
+ return value;
+}
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+ return value;
+}
+#endif
+static void hmp_attr_add(
+ const char *name,
+ int *value,
+ int (*to_sysfs)(int),
+ int (*from_sysfs)(int))
+{
+ int i = 0;
+ while (hmp_data.attributes[i] != NULL) {
+ i++;
+ if (i >= HMP_DATA_SYSFS_MAX)
+ return;
+ }
+ hmp_data.attr[i].attr.mode = 0644;
+ hmp_data.attr[i].show = hmp_show;
+ hmp_data.attr[i].store = hmp_store;
+ hmp_data.attr[i].attr.name = name;
+ hmp_data.attr[i].value = value;
+ hmp_data.attr[i].to_sysfs = to_sysfs;
+ hmp_data.attr[i].from_sysfs = from_sysfs;
+ hmp_data.attributes[i] = &hmp_data.attr[i].attr;
+ hmp_data.attributes[i + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+ int ret;
+ memset(&hmp_data, sizeof(hmp_data), 0);
+ /* by default load_avg_period_ms == LOAD_AVG_PERIOD
+ * meaning no change
+ */
+ hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
+
+ hmp_attr_add("load_avg_period_ms",
+ &hmp_data.multiplier,
+ hmp_period_tofrom_sysfs,
+ hmp_period_tofrom_sysfs);
+ hmp_attr_add("up_threshold",
+ &hmp_up_threshold,
+ NULL,
+ hmp_theshold_from_sysfs);
+ hmp_attr_add("down_threshold",
+ &hmp_down_threshold,
+ NULL,
+ hmp_theshold_from_sysfs);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* default frequency-invariant scaling ON */
+ hmp_data.freqinvar_load_scale_enabled = 1;
+ hmp_attr_add("frequency_invariant_load_scale",
+ &hmp_data.freqinvar_load_scale_enabled,
+ NULL,
+ hmp_freqinvar_from_sysfs);
+#endif
+ hmp_data.attr_group.name = "hmp";
+ hmp_data.attr_group.attrs = hmp_data.attributes;
+ ret = sysfs_create_group(kernel_kobj,
+ &hmp_data.attr_group);
+ return 0;
+}
+late_initcall(hmp_attr_init);
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+#endif /* CONFIG_SCHED_HMP */
+
+static inline bool is_buddy_busy(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * A busy buddy is a CPU with a high load or a small load with a lot of
+ * running tasks.
+ */
+ return ((rq->avg.usage_avg_sum << rq->nr_running) >
+ rq->avg.runnable_avg_period);
+}
+
+static inline bool is_light_task(struct task_struct *p)
+{
+ /* A light task runs less than 25% in average */
+ return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period);
+}
+
+static int check_pack_buddy(int cpu, struct task_struct *p)
+{
+ int buddy = per_cpu(sd_pack_buddy, cpu);
+
+ /* No pack buddy for this CPU */
+ if (buddy == -1)
+ return false;
+
+ /*
+ * If a task is waiting for running on the CPU which is its own buddy,
+ * let the default behavior to look for a better CPU if available
+ * The threshold has been set to 37.5%
+ */
+ if ((buddy == cpu)
+ && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5)))
+ return false;
+
+ /* buddy is not an allowed CPU */
+ if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
+ return false;
+
+ /*
+ * If the task is a small one and the buddy is not overloaded,
+ * we use buddy cpu
+ */
+ if (!is_light_task(p) || is_buddy_busy(buddy))
+ return false;
+
+ return true;
+}
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -2705,6 +3710,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (p->nr_cpus_allowed == 1)
return prev_cpu;
+ if (check_pack_buddy(cpu, p))
+ return per_cpu(sd_pack_buddy, cpu);
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -2779,8 +3787,50 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_up_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu);
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ if (hmp_down_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_select_slower_cpu(p, prev_cpu);
+ hmp_next_down_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ /* Make sure that the task stays in its previous hmp domain */
+ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
+ return prev_cpu;
+#endif
+
return new_cpu;
}
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+ }
+}
#endif /* CONFIG_SMP */
static unsigned long
@@ -3033,8 +4083,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
#ifdef CONFIG_SMP
/**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -3160,7 +4324,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
-
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -3300,52 +4463,58 @@ next:
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
- return 0;
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
- rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
- rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
@@ -3397,7 +4566,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
@@ -4457,12 +5626,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
+ update_rq_runnable_avg(this_rq, 1);
+
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4581,7 +5752,25 @@ static struct {
static inline int find_new_ilb(int call_cpu)
{
+ struct sched_domain *sd;
int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int buddy = per_cpu(sd_pack_buddy, call_cpu);
+
+ /*
+ * If we have a pack buddy CPU, we try to run load balance on a CPU
+ * that is close to the buddy.
+ */
+ if (buddy != -1)
+ for_each_domain(buddy, sd) {
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ continue;
+
+ ilb = cpumask_first_and(sched_domain_span(sd),
+ nohz.idle_cpus_mask);
+
+ if (ilb < nr_cpu_ids)
+ break;
+ }
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -4717,7 +5906,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -4886,6 +6075,267 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster cpu */
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if (p->prio >= hmp_up_prio)
+ return 0;
+#endif
+
+ /* Let the task load settle before doing another up migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_up_migration) >> 10)
+ < hmp_next_up_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio > hmp_up_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Check if task should migrate to a slower cpu */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if ((p->prio >= hmp_up_prio) &&
+ cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))) {
+ return 1;
+ }
+#endif
+
+ /* Let the task load settle before doing another down migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_down_migration) >> 10)
+ < hmp_next_down_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio < hmp_down_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int cpu;
+ struct sched_entity *curr;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_online_cpu(cpu) {
+ force = 0;
+ target = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr || !entity_is_task(curr)) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ p = task_of(curr);
+ if (hmp_up_migration(cpu, curr)) {
+ if (!target->active_balance) {
+ target->active_balance = 1;
+ target->push_cpu = hmp_select_faster_cpu(p, cpu);
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, 1);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -4897,6 +6347,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -4954,6 +6406,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5046,6 +6500,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (p->se.avg.decay_count) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+ __synchronize_entity_decay(&p->se);
+ subtract_blocked_load_contrib(cfs_rq,
+ p->se.avg.load_avg_contrib);
+ }
+#endif
}
/*
@@ -5092,11 +6560,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic64_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct cfs_rq *cfs_rq;
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +6601,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(&p->se);
+ p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6698,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5264,8 +6744,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
update_cfs_shares(group_cfs_rq(se));
+ /* update contribution to parent */
+ update_entity_load_avg(se, 1);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5319,6 +6802,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
@@ -5363,6 +6847,139 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
+
+#ifdef CONFIG_SCHED_HMP
+ hmp_cpu_mask_setup();
+#endif
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ u32 result = curr / max;
+ return result;
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_INCOMPATIBLE)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int ret = 0;
+
+ /* init safe defaults since there are no policies at registration */
+ for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+ /* safe defaults */
+ freq_scale[ret].max = 1024;
+ freq_scale[ret].min = 1024;
+ freq_scale[ret].curr_scale = 1024;
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (ret != -EINVAL)
+ ret = cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfab..b898762f5d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
unsigned long shares;
atomic_t load_weight;
+ atomic64_t load_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,22 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_SMP
/*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ u64 runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter, removed_load;
+ u64 last_decay;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ u32 tg_runnable_contrib, tg_usage_contrib;
+ u64 tg_load_contrib;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* h_load = weight * f(tg)
*
@@ -245,26 +247,30 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -412,6 +418,9 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+#ifdef CONFIG_SCHED_HMP
+ struct task_struct *migrate_task;
+#endif
/* cpu of this runqueue: */
int cpu;
int online;
@@ -467,6 +476,8 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+ struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
@@ -532,6 +543,12 @@ DECLARE_PER_CPU(int, sd_llc_id);
extern int group_balance_cpu(struct sched_group *sg);
+#ifdef CONFIG_SCHED_HMP
+static LIST_HEAD(hmp_domains);
+DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu)))
+#endif /* CONFIG_SCHED_HMP */
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -861,6 +878,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
+extern void update_packing_domain(int cpu);
extern void update_max_interval(void);
extern void update_group_power(struct sched_domain *sd, int cpu);
extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
@@ -1212,4 +1230,3 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..8cc2be049a4
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,13 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK=""
+CONFIG_HMP_SLOW_CPU_MASK=""
+CONFIG_HMP_VARIABLE_SCALE=y
+CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y
+CONFIG_SCHED_HMP_PRIO_FILTER=y
+CONFIG_SCHED_HMP_PRIO_FILTER_VAL=5