From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 27 Jan 2010 16:25:22 -0600 Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kgdb.c | 6 +++--- kernel/softlockup.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7bba93929..89232151a9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -310,6 +310,7 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); +extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void __user *buffer, @@ -323,6 +324,9 @@ static inline void softlockup_tick(void) static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) +{ +} static inline void touch_all_softlockup_watchdogs(void) { } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e2351..87f2cc55755 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1450,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1550,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e2..0d4c7898ab8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } -- cgit v1.2.3 From 5ecb01cfdf96c5f465192bdb2a4fd4a61a24c6cc Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sat, 23 Jan 2010 22:36:29 +0100 Subject: futex_lock_pi() key refcnt fix This fixes a futex key reference count bug in futex_lock_pi(), where a key's reference count is incremented twice but decremented only once, causing the backing object to not be released. If the futex is created in a temporary file in an ext3 file system, this bug causes the file's inode to become an "undead" orphan, which causes an oops from a BUG_ON() in ext3_put_super() when the file system is unmounted. glibc's test suite is known to trigger this, see . The bug is a regression from 2.6.28-git3, namely Peter Zijlstra's 38d47c1b7075bd7ec3881141bb3629da58f88dab "[PATCH] futex: rely on get_user_pages() for shared futexes". That commit made get_futex_key() also increment the reference count of the futex key, and updated its callers to decrement the key's reference count before returning. Unfortunately the normal exit path in futex_lock_pi() wasn't corrected: the reference count is incremented by get_futex_key() and queue_lock(), but the normal exit path only decrements once, via unqueue_me_pi(). The fix is to put_futex_key() after unqueue_me_pi(), since 2.6.31 this is easily done by 'goto out_put_key' rather than 'goto out'. Signed-off-by: Mikael Pettersson Acked-by: Peter Zijlstra Acked-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9..17828033a63 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1971,7 +1971,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); -- cgit v1.2.3 From 51246bfd189064079c54421507236fd2723b18f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Feb 2010 11:40:27 +0100 Subject: futex: Handle user space corruption gracefully If the owner of a PI futex dies we fix up the pi_state and set pi_state->owner to NULL. When a malicious or just sloppy programmed user space application sets the futex value to 0 e.g. by calling pthread_mutex_init(), then the futex can be acquired again. A new waiter manages to enqueue itself on the pi_state w/o damage, but on unlock the kernel dereferences pi_state->owner and oopses. Prevent this by checking pi_state->owner in the unlock path. If pi_state->owner is not current we know that user space manipulated the futex value. Ignore the mess and return -EINVAL. This catches the above case and also the case where a task hijacks the futex by setting the tid value and then tries to unlock it. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 17828033a63..06e8240d2ab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -758,6 +758,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); -- cgit v1.2.3 From 59647b6ac3050dd964bc556fe6ef22f4db5b935c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2010 09:33:05 +0100 Subject: futex: Handle futex value corruption gracefully The WARN_ON in lookup_pi_state which complains about a mismatch between pi_state->owner->pid and the pid which we retrieved from the user space futex is completely bogus. The code just emits the warning and then continues despite the fact that it detected an inconsistent state of the futex. A conveniant way for user space to spam the syslog. Replace the WARN_ON by a consistency check. If the values do not match return -EINVAL and let user space deal with the mess it created. This also fixes the missing task_pid_vnr() when we compare the pi_state->owner pid with the futex value. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 06e8240d2ab..e7a35f1039e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; -- cgit v1.2.3