From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. - If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code. Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3 From 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:20 -0400 Subject: timekeeping: Fix leapsecond triggered load spike issue The timekeeping code misses an update of the hrtimer subsystem after a leap second happened. Due to that timers based on CLOCK_REALTIME are either expiring a second early or late depending on whether a leap second has been inserted or deleted until an operation is initiated which causes that update. Unless the update happens by some other means this discrepancy between the timekeeping and the hrtimer data stays forever and timers are expired either early or late. The reported immediate workaround - $ data -s "`date`" - is causing a call to clock_was_set() which updates the hrtimer data structures. See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix Add the missing clock_was_set() call to update_wall_time() in case of a leap second event. The actual update is deferred to softirq context as the necessary smp function call cannot be invoked from hard interrupt context. Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..a413e5940e06 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -963,6 +963,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -1079,6 +1081,8 @@ static void update_wall_time(void) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); -- cgit v1.2.3 From 5b9fe759a678e05be4937ddf03d50e950207c1c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:21 -0400 Subject: timekeeping: Maintain ktime_t based offsets for hrtimers We need to update the hrtimer clock offsets from the hrtimer interrupt context. To avoid conversions from timespec to ktime_t maintain a ktime_t based representation of those offsets in the timekeeper. This puts the conversion overhead into the code which updates the underlying offsets and provides fast accessible values in the hrtimer interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a413e5940e06..1c038dac71a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper { /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; + + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + timekeeper.offs_real = timespec_to_ktime(tmp); +} + /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -604,6 +619,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -612,6 +628,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + timekeeper.total_sleep_time = t; + timekeeper.offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); } -- cgit v1.2.3 From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:24 -0400 Subject: timekeeping: Provide hrtimer update function To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1c038dac71a2..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.2.3