From 57430218317e5b280a80582a139b26029c25de6c Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Wed, 13 Jul 2016 16:50:01 +0200
Subject: sched/cputime: Count actually elapsed irq & softirq time

Currently, if there was any irq or softirq time during 'ticks'
jiffies, the entire period is accounted as irq or softirq time.

This is inaccurate if only a subset of the time was actually spent
handling irqs, and could conceivably mis-count all of the ticks during
a period as irq time, when there was some irq and some softirq time.

This can actually happen when irqtime_account_process_tick() is called
from account_idle_ticks(), which can pass a larger number of ticks down
all at once.

Fix this by changing irqtime_account_hi_update(),
irqtime_account_si_update(), and steal_account_process_tick() to work
with cputime_t time units, and return the amount of time spent in each
mode.

Rename steal_account_process_tick() to steal_account_process_time(),
to reflect that time is now accounted in cputime_t, instead of ticks.

Additionally, have irqtime_account_process_tick() take into account
how much time was spent in each of steal, irq, and softirq time.

The latter could help improve the accuracy of cputime accounting when
returning from idle on a NO_HZ_IDLE CPU. Properly accounting how much
time was spent in hardirq and softirq time will also allow the
NO_HZ_FULL code to re-use these same functions for hardirq and softirq
accounting.

Signed-off-by: Rik van Riel
[ Make nsecs_to_cputime64() actually return cputime64_t. ]
Signed-off-by: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim Krcmar
Cc: Thomas Gleixner
Cc: Wanpeng Li
Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 124 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 77 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3d60e5d76fdb..db82ae12cf01 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
+	cputime_t irq_cputime;
 
 	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_hardirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-		ret = 1;
+	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+		      cpustat[CPUTIME_IRQ];
+	irq_cputime = min(irq_cputime, maxtime);
+	cpustat[CPUTIME_IRQ] += irq_cputime;
 	local_irq_restore(flags);
-	return ret;
+	return irq_cputime;
 }
 
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
+	cputime_t softirq_cputime;
 
 	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_softirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-		ret = 1;
+	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+			  cpustat[CPUTIME_SOFTIRQ];
+	softirq_cputime = min(softirq_cputime, maxtime);
+	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
 	local_irq_restore(flags);
-	return ret;
+	return softirq_cputime;
 }
 
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #define sched_clock_irqtime	(0)
 
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+	return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
 static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,31 +267,44 @@ void account_idle_time(cputime_t cputime)
 	cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
+		cputime_t steal_cputime;
 		u64 steal;
-		unsigned long steal_jiffies;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
-		/*
-		 * steal is in nsecs but our caller is expecting steal
-		 * time in jiffies. Lets cast the result to jiffies
-		 * granularity and account the rest on the next rounds.
-		 */
-		steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
-		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+		steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+		account_steal_time(steal_cputime);
+		this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
 
-		account_steal_time(jiffies_to_cputime(steal_jiffies));
-		return steal_jiffies;
+		return steal_cputime;
 	}
 #endif
 	return 0;
 }
 
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+	cputime_t accounted;
+
+	accounted = steal_account_process_time(max);
+
+	if (accounted < max)
+		accounted += irqtime_account_hi_update(max - accounted);
+
+	if (accounted < max)
+		accounted += irqtime_account_si_update(max - accounted);
+
+	return accounted;
+}
+
 /*
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
@@ -342,21 +365,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 					 struct rq *rq, int ticks)
 {
-	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
-	u64 cputime = (__force u64) cputime_one_jiffy;
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+	cputime_t scaled, other;
 
-	if (steal_account_process_tick(ULONG_MAX))
+	/*
+	 * When returning from idle, many ticks can get accounted at
+	 * once, including some ticks of steal, irq, and softirq time.
+	 * Subtract those ticks from the amount of time accounted to
+	 * idle, or potentially user or system time. Due to rounding,
+	 * other time can exceed ticks occasionally.
+	 */
+	other = account_other_time(cputime);
+	if (other >= cputime)
 		return;
+	cputime -= other;
+	scaled = cputime_to_scaled(cputime);
 
-	cputime *= ticks;
-	scaled *= ticks;
-
-	if (irqtime_account_hi_update()) {
-		cpustat[CPUTIME_IRQ] += cputime;
-	} else if (irqtime_account_si_update()) {
-		cpustat[CPUTIME_SOFTIRQ] += cputime;
-	} else if (this_cpu_ksoftirqd() == p) {
+	if (this_cpu_ksoftirqd() == p) {
 		/*
 		 * ksoftirqd time do not get accounted in cpu_softirq_time.
 		 * So, we have to handle it separately here.
@@ -466,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime_t cputime, scaled, steal;
 	struct rq *rq = this_rq();
 
 	if (vtime_accounting_cpu_enabled())
@@ -477,16 +502,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
-	if (steal_account_process_tick(ULONG_MAX))
+	cputime = cputime_one_jiffy;
+	steal = steal_account_process_time(cputime);
+
+	if (steal >= cputime)
 		return;
 
+	cputime -= steal;
+	scaled = cputime_to_scaled(cputime);
+
 	if (user_tick)
-		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+		account_user_time(p, cputime, scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-				    one_jiffy_scaled);
+		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
 	else
-		account_idle_time(cputime_one_jiffy);
+		account_idle_time(cputime);
 }
 
 /*
@@ -681,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
-	unsigned long delta_jiffies, steal_jiffies;
+	cputime_t delta, steal;
 
-	delta_jiffies = now - tsk->vtime_snap;
-	steal_jiffies = steal_account_process_tick(delta_jiffies);
+	delta = jiffies_to_cputime(now - tsk->vtime_snap);
+	steal = steal_account_process_time(delta);
 
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap = now;
 
-	return jiffies_to_cputime(delta_jiffies - steal_jiffies);
+	return delta - steal;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
--
cgit v1.2.3


From b58c35840521bb02b150e1d0d34ca9197f8b7145 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Wed, 13 Jul 2016 16:50:02 +0200
Subject: sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code

The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time tracking code does not
appear to work correctly at present. On CPUs without nohz_full=, only
tick-based irq time sampling is done, which breaks down when dealing
with a nohz_idle CPU.

On firewalls and similar systems, no ticks may happen on a CPU for a
while, and the irq time spent may never get accounted properly. This
can cause issues with capacity planning and power saving, which use
the CPU statistics as inputs in decision making.

Remove the VTIME_GEN vtime irq time code, and replace it with the
IRQ_TIME_ACCOUNTING code, when selected as a config option by the user.
Signed-off-by: Rik van Riel
Signed-off-by: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim Krcmar
Cc: Thomas Gleixner
Cc: Wanpeng Li
Link: http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 16 +++------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index db82ae12cf01..ca7e33cb0967 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
-	cputime_t delta, steal;
+	cputime_t delta, other;
 
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
-	steal = steal_account_process_time(delta);
+	other = account_other_time(delta);
 
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap = now;
 
-	return delta - steal;
+	return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -738,16 +738,6 @@ void vtime_account_system(struct task_struct *tsk)
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
-	if (context_tracking_in_user())
-		tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
-}
-
 void vtime_account_user(struct task_struct *tsk)
 {
 	cputime_t delta_cpu;
--
cgit v1.2.3


From 0cfdf9a198b0d4f5ad6c87d894db7830b796b2cc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 13 Jul 2016 16:50:03 +0200
Subject: sched/cputime: Clean up the old vtime gen irqtime accounting completely

Vtime generic irqtime accounting has been removed but there are a few
remnants to clean up:

* The vtime_accounting_cpu_enabled() check in irq entry was only used
  by CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can safely remove it.

* Without the vtime_accounting_cpu_enabled() check, we no longer need
  the vtime_common_account_irq_enter() indirect function.

* Move the vtime_account_irq_enter() implementation under
  CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, which is the last user.

* The vtime_account_user() call was only used on irq entry for
  CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can remove that too.

Signed-off-by: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim Krcmar
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Wanpeng Li
Link: http://lkml.kernel.org/r/1468421405-20056-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 33 ++++++++++-----------------------
 1 file changed, 10 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ca7e33cb0967..16a873c203b1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -431,6 +431,10 @@ void vtime_common_task_switch(struct task_struct *prev)
 }
 #endif
 
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
@@ -440,33 +444,16 @@ void vtime_common_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	if (!in_interrupt()) {
-		/*
-		 * If we interrupted user, context_tracking_in_user()
-		 * is 1 because the context tracking don't hook
-		 * on irq entry/exit. This way we know if
-		 * we need to flush user time on kernel entry.
-		 */
-		if (context_tracking_in_user()) {
-			vtime_account_user(tsk);
-			return;
-		}
-
-		if (is_idle_task(tsk)) {
-			vtime_account_idle(tsk);
-			return;
-		}
-	}
-	vtime_account_system(tsk);
+	if (!in_interrupt() && is_idle_task(tsk))
+		vtime_account_idle(tsk);
+	else
+		vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	*ut = p->utime;
--
cgit v1.2.3


From 553bf6bbfd8a540c70aee28eb50e24caff456a03 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Wed, 13 Jul 2016 16:50:05 +0200
Subject: sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()

Paolo pointed out that irqs are already blocked when
irqtime_account_irq() is called. That means there is no reason to call
local_irq_save/restore() again.

Suggested-by: Paolo Bonzini
Signed-off-by: Rik van Riel
Signed-off-by: Frederic Weisbecker
Reviewed-by: Paolo Bonzini
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Radim Krcmar
Cc: Thomas Gleixner
Cc: Wanpeng Li
Link: http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 16a873c203b1..ea0f6f31a244 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  */
 void irqtime_account_irq(struct task_struct *curr)
 {
-	unsigned long flags;
 	s64 delta;
 	int cpu;
 
 	if (!sched_clock_irqtime)
 		return;
 
-	local_irq_save(flags);
-
 	cpu = smp_processor_id();
 	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
 	__this_cpu_add(irq_start_time, delta);
@@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr)
 	__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
--
cgit v1.2.3
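
The common thread in the series above is the clamping pattern in
account_other_time(): each source of "other" time may consume at most
whatever is left of the elapsed time being accounted. The standalone C
sketch below illustrates only that pattern; it is not kernel code. The
plain-integer cputime_t, the pending_* counters, and the consume()
helper are invented stand-ins for the per-cpu irq time counters and the
paravirt steal clock, under the assumption that each source tracks its
not-yet-accounted time as a simple pending delta.

/* other_time_sketch.c - illustrative only, not kernel code. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t cputime_t;	/* stand-in for the kernel's cputime_t */

/*
 * Pending time per source, i.e. time observed since it was last
 * accounted. In the kernel this is implicit: the per-cpu irq time
 * counters minus cpustat[], or the paravirt steal clock minus
 * rq->prev_steal_time. The values here are made up for the example.
 */
static cputime_t pending_steal   = 3;
static cputime_t pending_hardirq = 5;
static cputime_t pending_softirq = 4;

/* Consume at most @maxtime from @pending; the rest carries over. */
static cputime_t consume(cputime_t *pending, cputime_t maxtime)
{
	cputime_t t = *pending < maxtime ? *pending : maxtime;

	*pending -= t;
	return t;
}

/*
 * Mirrors the shape of account_other_time() from the first patch:
 * steal is charged first, then hardirq, then softirq, and each source
 * is clamped to whatever part of @max is still unclaimed.
 */
static cputime_t account_other_time(cputime_t max)
{
	cputime_t accounted = consume(&pending_steal, max);

	if (accounted < max)
		accounted += consume(&pending_hardirq, max - accounted);
	if (accounted < max)
		accounted += consume(&pending_softirq, max - accounted);

	return accounted;
}

int main(void)
{
	cputime_t elapsed = 10;	/* e.g. several ticks' worth of cputime */
	cputime_t other = account_other_time(elapsed);

	/*
	 * 3 steal + 5 hardirq + 2 softirq get charged (softirq is
	 * clamped from 4 down to 2); nothing is left over for user,
	 * system, or idle time, and the 2 remaining softirq units
	 * stay pending for the next accounting round.
	 */
	printf("other=%llu remaining=%llu softirq_carried=%llu\n",
	       (unsigned long long)other,
	       (unsigned long long)(elapsed - other),
	       (unsigned long long)pending_softirq);
	return 0;
}

Note that the ordering mirrors the patch: steal is claimed first, then
hardirq, then softirq, so each later source only ever sees the time the
earlier ones left behind, and the total can never exceed the elapsed
time being accounted.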