aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c320
1 files changed, 290 insertions, 30 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c7102fe3a5c..0d6c21b1f4bc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -34,10 +34,17 @@
#ifdef CONFIG_HMP_VARIABLE_SCALE
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
-#endif
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -1205,8 +1212,93 @@ static u32 __compute_runnable_contrib(u64 n)
}
#ifdef CONFIG_HMP_VARIABLE_SCALE
-static u64 hmp_variable_scale_convert(u64 delta);
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value;
+ int (*to_sysfs)(int);
+ int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled;
#endif
+ int multiplier; /* used to scale the time delta */
+ struct attribute_group attr_group;
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale;
+ u32 min;
+ u32 max;
+ u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
/* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
@@ -1237,11 +1329,18 @@ static u64 hmp_variable_scale_convert(u64 delta);
static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
int runnable,
- int running)
+ int running,
+ int cpu)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ u64 scaled_delta;
+ u32 scaled_runnable_contrib;
+ int scaled_delta_w;
+ u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
delta = now - sa->last_runnable_update;
#ifdef CONFIG_HMP_VARIABLE_SCALE
@@ -1265,6 +1364,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
return 0;
sa->last_runnable_update = now;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->runnable_avg_period % 1024;
if (delta + delta_w >= 1024) {
@@ -1277,10 +1382,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
* period and accrue it.
*/
delta_w = 1024 - delta_w;
+ /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta_w = (delta_w * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta_w;
+ if (running)
+ sa->usage_avg_sum += scaled_delta_w;
+#else
if (runnable)
sa->runnable_avg_sum += delta_w;
if (running)
sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta_w;
delta -= delta_w;
@@ -1288,27 +1403,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
-
+ /* decay the load we have accumulated so far */
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
periods + 1);
sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
-
+ /* add the contribution from this period */
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
+ /* Apply load scaling if necessary.
+ * Note that multiplying the whole series is same as
+ * multiplying all terms
+ */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_runnable_contrib = (runnable_contrib * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += scaled_runnable_contrib;
+#else
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
if (running)
sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
+ /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta;
+ if (running)
+ sa->usage_avg_sum += scaled_delta;
+#else
if (runnable)
sa->runnable_avg_sum += delta;
if (running)
sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta;
return decayed;
@@ -1487,7 +1624,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
now = cfs_rq_clock_task(group_cfs_rq(se));
if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
- cfs_rq->curr == se))
+ cfs_rq->curr == se, se->cfs_rq->rq->cpu))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1533,7 +1670,7 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
u32 contrib;
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
- runnable);
+ runnable, rq->cpu);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
contrib /= (rq->avg.runnable_avg_period + 1);
@@ -3503,27 +3640,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
* delta time by 1/22 and setting load_avg_period_ms = 706.
*/
-#define HMP_VARIABLE_SCALE_SHIFT 16ULL
-struct hmp_global_attr {
- struct attribute attr;
- ssize_t (*show)(struct kobject *kobj,
- struct attribute *attr, char *buf);
- ssize_t (*store)(struct kobject *a, struct attribute *b,
- const char *c, size_t count);
- int *value;
- int (*to_sysfs)(int);
- int (*from_sysfs)(int);
-};
-
-#define HMP_DATA_SYSFS_MAX 3
-
-struct hmp_data_struct {
- int multiplier; /* used to scale the time delta */
- struct attribute_group attr_group;
- struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
- struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
-} hmp_data;
-
/*
* By scaling the delta time it end-up increasing or decrease the
* growing speed of the per entity load_avg_ratio
@@ -3591,7 +3707,15 @@ static int hmp_theshold_from_sysfs(int value)
return -1;
return value;
}
-
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+ return value;
+}
+#endif
static void hmp_attr_add(
const char *name,
int *value,
@@ -3636,7 +3760,14 @@ static int hmp_attr_init(void)
&hmp_down_threshold,
NULL,
hmp_theshold_from_sysfs);
-
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* default frequency-invariant scaling ON */
+ hmp_data.freqinvar_load_scale_enabled = 1;
+ hmp_attr_add("frequency_invariant_load_scale",
+ &hmp_data.freqinvar_load_scale_enabled,
+ NULL,
+ hmp_freqinvar_from_sysfs);
+#endif
hmp_data.attr_group.name = "hmp";
hmp_data.attr_group.attrs = hmp_data.attributes;
ret = sysfs_create_group(kernel_kobj,
@@ -6802,3 +6933,132 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ u32 result = curr / max;
+ return result;
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_INCOMPATIBLE)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int ret = 0;
+
+ /* init safe defaults since there are no policies at registration */
+ for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+ /* safe defaults */
+ freq_scale[ret].max = 1024;
+ freq_scale[ret].min = 1024;
+ freq_scale[ret].curr_scale = 1024;
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (ret != -EINVAL)
+ ret = cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */