From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Wed, 27 Jan 2010 16:25:22 -0600
Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb
 resume

When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets
the time from hardware such as the TSC on x86. In this
configuration kgdb will report a softlock warning message on
resuming or detaching from a debug session.

Sequence of events in the problem case:

 1) "cpu sched clock" and "hardware time" are at 100 sec prior
    to a call to kgdb_handle_exception()

 2) Debugger waits in kgdb_handle_exception() for 80 sec and on
    exit the following is called ...  touch_softlockup_watchdog() -->
    __raw_get_cpu_var(touch_timestamp) = 0;

 3) "cpu sched clock" = 100s (it was not updated, because the
    interrupt was disabled in kgdb) but the "hardware time" = 180 sec

 4) The first timer interrupt after resuming from
    kgdb_handle_exception updates the watchdog from the "cpu sched clock"

update_process_times() { ...  run_local_timers() -->
softlockup_tick() --> check (touch_timestamp == 0) (it is "YES"
here, we have set "touch_timestamp = 0" at kgdb) -->
__touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp"
to "get_timestamp()" (Here, the "touch_timestamp" will still be
set to 100s.)  ...

    scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched
    clock" to "hardware time" = 180s) ...  }

 5) The Second timer interrupt handler appears to have a large
    jump and trips the softlockup warning.

update_process_times() { ...  run_local_timers() -->
softlockup_tick() --> "cpu sched clock" - "touch_timestamp" =
180s-100s > 60s --> printk "soft lockup error messages" ...  }

note: ***(A) reset "touch_timestamp" to
"get_timestamp(this_cpu)"

Why is "touch_timestamp" 100 sec, instead of 180 sec?

When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of
get_timestamp() is:

get_timestamp(this_cpu)
 -->cpu_clock(this_cpu)
 -->sched_clock_cpu(this_cpu)
 -->__update_sched_clock(sched_clock_data, now)

The __update_sched_clock() function uses the GTOD tick value to
create a window to normalize the "now" values.  So if "now"
value is too big for sched_clock_data, it will be ignored.

The fix is to invoke sched_clock_tick() to update "cpu sched
clock" in order to recover from this state.  This is done by
introducing the function touch_softlockup_watchdog_sync(). This
allows kgdb to request that the sched clock is updated when the
watchdog thread runs the first time after a resume from kgdb.

[yong.zhang0@gmail.com: Use per cpu instead of an array]
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Dongdong Deng <Dongdong.Deng@windriver.com>
Cc: kgdb-bugreport@lists.sourceforge.net
Cc: peterz@infradead.org
LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  4 ++++
 kernel/kgdb.c         |  6 +++---
 kernel/softlockup.c   | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f7bba93929..89232151a9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -310,6 +310,7 @@ extern void sched_show_task(struct task_struct *p);
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
+extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 				    void __user *buffer,
@@ -323,6 +324,9 @@ static inline void softlockup_tick(void)
 static inline void touch_softlockup_watchdog(void)
 {
 }
+static inline void touch_softlockup_watchdog_sync(void)
+{
+}
 static inline void touch_all_softlockup_watchdogs(void)
 {
 }
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e2351..87f2cc55755 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs)
 
 	/* Signal the primary CPU that we are done: */
 	atomic_set(&cpu_in_kgdb[cpu], 0);
-	touch_softlockup_watchdog();
+	touch_softlockup_watchdog_sync();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 }
@@ -1450,7 +1450,7 @@ acquirelock:
 	    (kgdb_info[cpu].task &&
 	     kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
 		atomic_set(&kgdb_active, -1);
-		touch_softlockup_watchdog();
+		touch_softlockup_watchdog_sync();
 		clocksource_touch_watchdog();
 		local_irq_restore(flags);
 
@@ -1550,7 +1550,7 @@ kgdb_restore:
 	}
 	/* Free kgdb_active */
 	atomic_set(&kgdb_active, -1);
-	touch_softlockup_watchdog();
+	touch_softlockup_watchdog_sync();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e2..0d4c7898ab8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
 static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
 static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(bool, softlock_touch_sync);
 
 static int __read_mostly did_panic;
 int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
+void touch_softlockup_watchdog_sync(void)
+{
+	__raw_get_cpu_var(softlock_touch_sync) = true;
+	__raw_get_cpu_var(softlockup_touch_ts) = 0;
+}
+
 void touch_all_softlockup_watchdogs(void)
 {
 	int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
 	}
 
 	if (touch_ts == 0) {
+		if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
+			/*
+			 * If the time stamp was touched atomically
+			 * make sure the scheduler tick is up to date.
+			 */
+			per_cpu(softlock_touch_sync, this_cpu) = false;
+			sched_clock_tick();
+		}
 		__touch_softlockup_watchdog();
 		return;
 	}
-- 
cgit v1.2.3


From 5ecb01cfdf96c5f465192bdb2a4fd4a61a24c6cc Mon Sep 17 00:00:00 2001
From: Mikael Pettersson <mikpe@it.uu.se>
Date: Sat, 23 Jan 2010 22:36:29 +0100
Subject: futex_lock_pi() key refcnt fix

This fixes a futex key reference count bug in futex_lock_pi(),
where a key's reference count is incremented twice but decremented
only once, causing the backing object to not be released.

If the futex is created in a temporary file in an ext3 file system,
this bug causes the file's inode to become an "undead" orphan,
which causes an oops from a BUG_ON() in ext3_put_super() when the
file system is unmounted. glibc's test suite is known to trigger this,
see <http://bugzilla.kernel.org/show_bug.cgi?id=14256>.

The bug is a regression from 2.6.28-git3, namely Peter Zijlstra's
38d47c1b7075bd7ec3881141bb3629da58f88dab "[PATCH] futex: rely on
get_user_pages() for shared futexes". That commit made get_futex_key()
also increment the reference count of the futex key, and updated its
callers to decrement the key's reference count before returning.
Unfortunately the normal exit path in futex_lock_pi() wasn't corrected:
the reference count is incremented by get_futex_key() and queue_lock(),
but the normal exit path only decrements once, via unqueue_me_pi().
The fix is to put_futex_key() after unqueue_me_pi(), since 2.6.31
this is easily done by 'goto out_put_key' rather than 'goto out'.

Signed-off-by: Mikael Pettersson <mikpe@it.uu.se>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Darren Hart <dvhltc@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@kernel.org>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9..17828033a63 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1971,7 +1971,7 @@ retry_private:
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
-	goto out;
+	goto out_put_key;
 
 out_unlock_put_key:
 	queue_unlock(&q, hb);
-- 
cgit v1.2.3


From 51246bfd189064079c54421507236fd2723b18f3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 2 Feb 2010 11:40:27 +0100
Subject: futex: Handle user space corruption gracefully

If the owner of a PI futex dies we fix up the pi_state and set
pi_state->owner to NULL. When a malicious or just sloppy programmed
user space application sets the futex value to 0 e.g. by calling
pthread_mutex_init(), then the futex can be acquired again. A new
waiter manages to enqueue itself on the pi_state w/o damage, but on
unlock the kernel dereferences pi_state->owner and oopses.

Prevent this by checking pi_state->owner in the unlock path. If
pi_state->owner is not current we know that user space manipulated the
futex value. Ignore the mess and return -EINVAL.

This catches the above case and also the case where a task hijacks the
futex by setting the tid value and then tries to unlock it.

Reported-by: Jermome Marchand <jmarchan@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
---
 kernel/futex.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/futex.c b/kernel/futex.c
index 17828033a63..06e8240d2ab 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -758,6 +758,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!pi_state)
 		return -EINVAL;
 
+	/*
+	 * If current does not own the pi_state then the futex is
+	 * inconsistent and user space fiddled with the futex value.
+	 */
+	if (pi_state->owner != current)
+		return -EINVAL;
+
 	raw_spin_lock(&pi_state->pi_mutex.wait_lock);
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
-- 
cgit v1.2.3


From 59647b6ac3050dd964bc556fe6ef22f4db5b935c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Feb 2010 09:33:05 +0100
Subject: futex: Handle futex value corruption gracefully

The WARN_ON in lookup_pi_state which complains about a mismatch
between pi_state->owner->pid and the pid which we retrieved from the
user space futex is completely bogus.

The code just emits the warning and then continues despite the fact
that it detected an inconsistent state of the futex. A conveniant way
for user space to spam the syslog.

Replace the WARN_ON by a consistency check. If the values do not match
return -EINVAL and let user space deal with the mess it created.

This also fixes the missing task_pid_vnr() when we compare the
pi_state->owner pid with the futex value.

Reported-by: Jermome Marchand <jmarchan@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
---
 kernel/futex.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 06e8240d2ab..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 				return -EINVAL;
 
 			WARN_ON(!atomic_read(&pi_state->refcount));
-			WARN_ON(pid && pi_state->owner &&
-				pi_state->owner->pid != pid);
+
+			/*
+			 * When pi_state->owner is NULL then the owner died
+			 * and another waiter is on the fly. pi_state->owner
+			 * is fixed up by the task which acquires
+			 * pi_state->rt_mutex.
+			 *
+			 * We do not check for pid == 0 which can happen when
+			 * the owner died and robust_list_exit() cleared the
+			 * TID.
+			 */
+			if (pid && pi_state->owner) {
+				/*
+				 * Bail out if user space manipulated the
+				 * futex value.
+				 */
+				if (pid != task_pid_vnr(pi_state->owner))
+					return -EINVAL;
+			}
 
 			atomic_inc(&pi_state->refcount);
 			*ps = pi_state;
-- 
cgit v1.2.3