From e40b17208b6805be50ffe891878662b6076206b9 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:03 -0500 Subject: x86: Move notify_die from nmi.c to traps.c In order to handle a new nmi_watchdog approach, I need to move the notify_die() routine out of nmi_watchdog_tick() and into default_do_nmi(). This lets me easily swap out the old nmi_watchdog with the new one with just a config change. The change probably makes sense from a high level perspective because the nmi_watchdog shouldn't be handling notify_die routines anyway. However, this move does change the semantics a little bit. Instead of checking on every nmi interrupt if the cpus are stuck, only check them on the nmi_watchdog interrupts. v2: Move notify_die call into #ifdef block Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 7 ------- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396c..5d47682f580 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -400,13 +400,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e445418..51ef893ffa6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -400,7 +400,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. -- cgit v1.2.3 From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:04 -0500 Subject: nmi_watchdog: Add new, generic implementation, using perf events This is a new generic nmi_watchdog implementation using the perf events infrastructure as suggested by Ingo. The implementation is simple, just create an in-kernel perf event and register an overflow handler to check for cpu lockups. I created a generic implementation that lives in kernel/ and the hardware specific part that for now lives in arch/x86. This approach has a number of advantages: - It simplifies the x86 PMU implementation in the long run, in that it removes the hardcoded low-level PMU implementation that was the NMI watchdog before. - It allows new NMI watchdog features to be added in a central place. - It allows other architectures to enable the NMI watchdog, as long as they have perf events (that provide NMIs) implemented. - It also allows for more graceful co-existence of existing perf events apps and the NMI watchdog - before these changes the relationship was exclusive. (The NMI watchdog will 'spend' a perf event when enabled. In later iterations we might be able to piggyback from an existing NMI event without having to allocate a hardware event for the NMI watchdog - turning this into a no-hardware-cost feature.)
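[ Editor's sketch: the generic half of this design lives in kernel/ and is therefore not part of this arch/x86-limited listing. A minimal illustration of the idea, assuming the perf API of this era (perf_event_create_kernel_counter() taking a pid argument); the wd_overflow/enable_nmi_watchdog names are illustrative, not necessarily the ones merged: ]

#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/kernel.h>
/* hw_nmi_is_cpu_stuck()/hw_nmi_get_sample_period() are the arch helpers
 * introduced by this series; their declaring header is assumed here. */

static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);

/* Runs in NMI context each time the cycle counter overflows. */
static void wd_overflow(struct perf_event *event, int nmi,
			struct perf_sample_data *data, struct pt_regs *regs)
{
	if (hw_nmi_is_cpu_stuck(regs))
		panic("NMI watchdog: CPU %d stuck!", smp_processor_id());
}

static int enable_nmi_watchdog(int cpu)
{
	struct perf_event *event;
	struct perf_event_attr wd_attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(wd_attr),
		.pinned		= 1,
		.disabled	= 1,
	};

	/* one NMI per watchdog period's worth of unhalted cycles */
	wd_attr.sample_period = hw_nmi_get_sample_period();

	event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow);
	if (IS_ERR(event))
		return PTR_ERR(event);

	per_cpu(nmi_watchdog_ev, cpu) = event;
	perf_event_enable(event);
	return 0;
}

[ The hw_nmi.c file added below supplies exactly the two arch hooks this sketch assumes: the stuck-cpu heuristic and the sample period. ]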
As for compatibility, we'll keep the old NMI watchdog code as well until the new one can 100% replace it on all CPUs, old and new alike. That might take some time as the NMI watchdog has been ported to many CPU models. I have done light testing to make sure the framework works correctly and it does. v2: Set the correct timeout values based on the old nmi watchdog Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 arch/x86/kernel/apic/hw_nmi.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 00000000000..8c0e6a410d0 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,114 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +static DEFINE_PER_CPU(unsigned, last_irq_sum); + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +int hw_nmi_is_cpu_stuck(struct pt_regs *regs) +{ + unsigned int sum; + int cpu = smp_processor_id(); + + /* FIXME: cheap hack for this check, probably should get its own + * die_notifier handler + */ + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + } + + /* if we are doing an mce, just assume the cpu is not stuck */ + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; + + /* We determine if the cpu is stuck by checking whether any + * interrupts have happened since we last checked. 
Of course + * an nmi storm could create false positives, but the higher + * level logic should account for that + */ + sum = get_timer_irqs(cpu); + if (__get_cpu_var(last_irq_sum) == sum) { + return 1; + } else { + __get_cpu_var(last_irq_sum) = sum; + return 0; + } +} + +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +/* STUB calls to mimic old nmi_watchdog behaviour */ +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +int nmi_watchdog_enabled; +int unknown_nmi_panic; +void cpu_nmi_set_wd_enabled(void) { return; } +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.3 From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 7 ++++++- arch/x86/kernel/traps.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507..1a4512e48d2 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_NMI_WATCHDOG),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 51ef893ffa6..973cbc4f044 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; +#ifndef CONFIG_NMI_WATCHDOG /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_NMI_WATCHDOG */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.3 From c3128fb6ad39b0edda6675d20585a64846cf89ea Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:18 -0500 Subject: nmi_watchdog: Use a boolean config flag for compiling Determines if an arch has setup arch specific perf_events and nmi_watchdog code. 
This should restrict compiles to only those arches ready. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfdee3ee..4f9685fa3a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -52,6 +52,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS + select PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.2.3 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. Tested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f334..5b41b0feb6d 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d0..312d772c5c3 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) { @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.3 From 2cc4452bc31fc1cde6f0b64a4eb13269f982787d Mon Sep 17 00:00:00 2001 From: Don Zickus Date:
Thu, 18 Feb 2010 21:56:52 -0500 Subject: nmi_watchdog: Fix undefined 'apic' build bug Ingo provided me a config that fails to compile with: arch/x86/built-in.o: In function `arch_trigger_all_cpu_backtrace': (.text+0x17e78): undefined reference to `apic' make: *** [.tmp_vmlinux1] Error 1 I realized I changed the compile behaviour of the nmi code by not wrapping it with CONFIG_LOCAL_APIC. To fix this I add a compile check for ARCH_HAS_NMI_WATCHDOG around arch_trigger_all_cpu_backtrace. Signed-off-by: Don Zickus Cc: a.p.zijlstra@chello.nl Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266548212-24243-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 312d772c5c3..0b4d205a6b8 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -92,6 +92,7 @@ u64 hw_nmi_get_sample_period(void) return cpu_khz * 1000; } +#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; @@ -108,6 +109,7 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } +#endif /* STUB calls to mimic old nmi_watchdog behaviour */ #if defined(CONFIG_X86_LOCAL_APIC) -- cgit v1.2.3 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 +++++++------- arch/x86/kernel/traps.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8..e8b78a0be5d 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. 
Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044..bdc7fab3ef3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* -- cgit v1.2.3 From 45c34e05c4e3d36e7c44e790241ea11a1d90d54e Mon Sep 17 00:00:00 2001 From: John Villalovos Date: Fri, 7 May 2010 12:41:40 -0400 Subject: Oprofile: Change CPUIDS from decimal to hex, and add some comments Back when the patch was submitted for "Add Xeon 7500 series support to oprofile", Robert Richter had asked for a followon patch that converted all the CPU ID values to hex. I have done that here for the "i386/core_i7" and "i386/atom" class processors in the ppro_init() function and also added some comments on where to find documentation on the Intel processors. Signed-off-by: John L. Villalovos Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b28d2f1253b..1ba67dc8006 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type) if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -655,12 +667,12 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x1a: case 0x2e: - case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; - case 28: + case 0x1c: *cpu_type = "i386/atom"; break; default: -- cgit v1.2.3 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. 
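[ Editor's sketch: the combined detector itself lands in kernel/watchdog.c, outside this arch/x86 listing. Stripped to its core, both checks reduce to per-cpu counter comparisons; variable, helper, and threshold names below are illustrative, not necessarily the merged ones: ]

#include <linux/percpu.h>
#include <linux/jiffies.h>	/* time_after() works on any wrapping counters */

static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);	/* seconds; the kthread refreshes it */
static int softlockup_thresh = 60;	/* seconds; tunable in the real code */

/* Called from the perf NMI: has the hrtimer fired since the last NMI? */
static int is_hardlockup(int cpu)
{
	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);

	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
		return 1;	/* interrupts stalled on this cpu */

	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
	return 0;
}

/* Called from the hrtimer: has the watchdog kthread run recently? */
static int is_softlockup(int cpu, unsigned long now)
{
	unsigned long touch_ts = per_cpu(watchdog_touch_ts, cpu);

	if (time_after(now, touch_ts + softlockup_thresh))
		return 1;	/* the kthread is being starved */

	return 0;
}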
If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - separated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 ++-- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0feb6d..932f0f86b4b 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d2..52f32e0ea19 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5d..79425f96fce 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3..bd347c2b34d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog.
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.3 From 7cbb7e7fa46f6e5229438ac9e4a5c72ec0d53e0b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:48 -0400 Subject: x86: Move trigger_all_cpu_backtrace to its own die_notifier As part of the transition of the nmi watchdog to something more generic, the trigger_all_cpu_backtrace code is getting left behind. Put it in its own die_notifier so it can still be used. V2: - use arch_spin_locks Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-6-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79425f96fce..8c3edfb89c2 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,6 +17,10 @@ #include #include #include +#include +#include +#include + #include #include @@ -54,20 +58,6 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) unsigned int sum; int cpu = smp_processor_id(); - /* FIXME: cheap hack for this check, probably should get its own - * die_notifier handler - */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - } - /* if we are doing an mce, just assume the cpu is not stuck */ /* Could check oops_in_progress here too, but it's safer not to */ if (mce_in_progress()) @@ -109,6 +99,53 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = 1 +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); #endif /* STUB calls to mimic old nmi_watchdog behaviour */ -- cgit v1.2.3 From 10f9014912a2b1cb59c39cdea777e6d9afa8f17e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:49 -0400 Subject: x86: Cleanup hw_nmi.c cruft The design of the hardlockup watchdog has changed and cruft was left behind in the hw_nmi.c file. 
Just remove the code that isn't used anymore. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-7-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 58 ------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c3edfb89c2..3b40082f037 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,74 +9,16 @@ * */ -#include -#include #include -#include -#include -#include -#include -#include #include #include #include - - #include #include /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static DEFINE_PER_CPU(unsigned, last_irq_sum); - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; - -#if defined(CONFIG_X86_LOCAL_APIC) - irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif - - return irqs; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -int hw_nmi_is_cpu_stuck(struct pt_regs *regs) -{ - unsigned int sum; - int cpu = smp_processor_id(); - - /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; - - /* We determine if the cpu is stuck by checking whether any - * interrupts have happened since we last checked. Of course - * an nmi storm could create false positives, but the higher - * level logic should account for that - */ - sum = get_timer_irqs(cpu); - if (__get_cpu_var(last_irq_sum) == sum) { - return 1; - } else { - __get_cpu_var(last_irq_sum) = sum; - return 0; - } -} - u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; -- cgit v1.2.3 From 5e85391b3badd3f0e50ebdd0cafe0202a979f73a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2010 09:12:39 +0200 Subject: x86, watchdog: Fix build error in hw_nmi.c On some configs the following build error triggers: arch/x86/kernel/apic/hw_nmi.c:35: error: 'apic' undeclared (first use in this function) arch/x86/kernel/apic/hw_nmi.c:35: error: (Each undeclared identifier is reported only once arch/x86/kernel/apic/hw_nmi.c:35: error: for each function it appears in.) Because asm/apic.h was only included implicitly. Include it explicitly. 
Cc: Frederic Weisbecker Cc: Don Zickus Cc: Peter Zijlstra Cc: Cyrill Gorcunov LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 3b40082f037..cefd6942f0e 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -8,6 +8,7 @@ * Bits copied from original nmi.c file * */ +#include #include #include -- cgit v1.2.3 From c01d4323309a90a298fd81cf3a059ee1b12be2e9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 May 2010 22:57:48 +0200 Subject: lockup_detector: Adapt CONFIG_PERF_EVENT_NMI to other archs CONFIG_PERF_EVENT_NMI is something that need to be enabled from the arch. This is fine on x86 as PERF_EVENTS is builtin but if other archs select it, they will need to handle the PERF_EVENTS dependency. Instead, handle the dependency in the generic layer: - archs need to tell what they support through HAVE_PERF_EVENTS_NMI - Enable magically PERF_EVENTS_NMI if we have PERF_EVENTS and HAVE_PERF_EVENTS_NMI. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3cb28cd1f55..3cb5bb02172 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,7 +54,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS - select PERF_EVENTS_NMI + select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.2.3 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. 
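[ Editor's sketch: concretely, the generic helper definition wants a CONFIG_HARDLOCKUP_DETECTOR guard so that arch-private versions (sparc, in the build failure below) keep their own copy. The per-cpu flag name here is illustrative: ]

/* kernel/watchdog.c (sketch): compile the generic helper only when the
 * perf-based hardlockup detector is actually built in, so architectures
 * that define their own touch_nmi_watchdog() do not clash with it. */
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
	__get_cpu_var(watchdog_nmi_touch) = true;	/* illustrative flag */
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif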
Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea19..910f20b457c 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o -- cgit v1.2.3 From 1dedefd1a066a795a87afca9c0236e1a94de9bf6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:23 -0700 Subject: x86: detect scattered cpuid features earlier Some extra CPU features such as ARAT is needed in early boot so that x86_init function pointers can be set up properly. http://lkml.org/lkml/2010/5/18/519 At start_kernel() level, this patch moves init_scattered_cpuid_features() from check_bugs() to setup_arch() -> early_cpu_init() which is earlier than platform specific x86_init layer setup. Suggested by HPA. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-2-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1c00d0b169..284bf89ddae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } -- cgit v1.2.3 From a0c173bd8a3fd0541be8e4ef962170e48d8811c7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:24 -0700 Subject: x86, mrst: add cpu type detection Medfield is the follow-up of Moorestown, it is treated under the same HW sub-architecture. However, we do need to know the CPU type in order for some of the driver to act accordingly. We also have different optimal clock configuration for each CPU type. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mrst.h | 19 +++++++++++++++++++ arch/x86/kernel/mrst.c | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 451d30e7f62..dc5c8500bfc 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,8 +11,27 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +extern int mrst_identify_cpu(void); int __init sfi_parse_mrtc(struct sfi_table_header *table); +/* + * Medfield is the follow-up of Moorestown, it combines two chip solution into + * one. Other than that it also added always-on and constant tsc and lapic + * timers. Medfield is the platform name, and the chip name is called Penwell + * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be + * identified via MSRs. + */ +enum mrst_cpu_type { + MRST_CPU_CHIP_LINCROFT = 1, + MRST_CPU_CHIP_PENWELL, +}; + +enum mrst_timer_options { + MRST_TIMER_DEFAULT, + MRST_TIMER_APBT_ONLY, + MRST_TIMER_LAPIC_APBT, +}; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index e796448f0eb..ceaebeb5866 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -27,6 +27,8 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +static int mrst_cpu_chip; + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -216,6 +218,28 @@ static void __init mrst_setup_boot_clock(void) setup_boot_APIC_clock(); }; +int mrst_identify_cpu(void) +{ + return mrst_cpu_chip; +} +EXPORT_SYMBOL_GPL(mrst_identify_cpu); + +void __cpuinit mrst_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -230,6 +254,8 @@ void __init x86_mrst_early_setup(void) x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.oem.arch_setup = mrst_arch_setup; + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; -- cgit v1.2.3 From a875c01944f0d750eeb1ef3133feceb13f13c4b3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:25 -0700 Subject: x86, mrst: add more timer config options Always-on local APIC timer (ARAT) has been introduced to Medfield, along with the platform APB timers we have more timer configuration options between Moorestown and Medfield. This patch adds run-time detection of available timer features so that we can treat Medfield as a variant of Moorestown and set up the optimal timer options for each platform. i.e. Medfield: per cpu always-on local APIC timer Moorestown: per cpu APB timer Manual override is possible via cmdline option x86_mrst_timer. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/mrst.h | 1 + arch/x86/kernel/apb_timer.c | 37 ++++------------- arch/x86/kernel/mrst.c | 88 ++++++++++++++++++++++++++++------------ 4 files changed, 72 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index c74a2eebe57..a69b1ac9eaf 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); extern unsigned int boot_cpu_id; -extern int disable_apbt_percpu; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index dc5c8500bfc..67ad3154577 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -12,6 +12,7 @@ #define _ASM_X86_MRST_H extern int pci_mrst_init(void); extern int mrst_identify_cpu(void); +extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d3..8dd77800ff5 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include #include +#include #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -194,29 +193,6 @@ static struct clock_event_device apbt_clockevent = { .rating = APBT_CLOCKEVENT_RATING, }; -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - /* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. 
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index ceaebeb5866..636b53bd419 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,6 +25,29 @@ #include #include +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +int mrst_timer_options __cpuinitdata; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; static int mrst_cpu_chip; @@ -169,18 +192,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. 
- */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -197,6 +208,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -207,17 +233,6 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) -{ - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; - int mrst_identify_cpu(void) { return mrst_cpu_chip; @@ -250,13 +265,13 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.oem.arch_setup = mrst_arch_setup; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; @@ -269,3 +284,26 @@ void __init x86_mrst_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; } + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); -- cgit v1.2.3 From a75af580bb1fd261bf63cc00e4b324e17ceb15cf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 13:40:14 -0700 Subject: x86, mrst: make mrst_identify_cpu() an inline returning enum We have an enum, might as well use it. While we're at it, make it an inline... there is really no point in calling a function for this stuff. LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 7 ++++++- arch/x86/kernel/mrst.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 67ad3154577..1869c18d15c 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_identify_cpu(void); extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -27,6 +26,12 @@ enum mrst_cpu_type { MRST_CPU_CHIP_PENWELL, }; +extern enum mrst_cpu_type __mrst_cpu_chip; +static enum mrst_cpu_type mrst_identify_cpu(void) +{ + return __mrst_cpu_chip; +} + enum mrst_timer_options { MRST_TIMER_DEFAULT, MRST_TIMER_APBT_ONLY, diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 636b53bd419..967f2686adb 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -50,7 +50,8 @@ int mrst_timer_options __cpuinitdata; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -static int mrst_cpu_chip; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); int sfi_mtimer_num; @@ -233,25 +234,19 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -int mrst_identify_cpu(void) -{ - return mrst_cpu_chip; -} -EXPORT_SYMBOL_GPL(mrst_identify_cpu); - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; } pr_debug("Moorestown CPU %s identified\n", - (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? "Lincroft" : "Penwell"); } -- cgit v1.2.3 From 14671386dcbafb3086bbda3cb6f9f27d34c7bf6d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 14:37:40 -0700 Subject: x86, mrst: make mrst_timer_options an enum We have an enum mrst_timer_options, use it so that the kernel knows if we're missing something from a switch statement or equivalent. Signed-off-by: H. 
Peter Anvin LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Cc: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 3 ++- arch/x86/kernel/mrst.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 1869c18d15c..16350740edf 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* @@ -38,6 +37,8 @@ enum mrst_timer_options { MRST_TIMER_LAPIC_APBT, }; +extern enum mrst_timer_options mrst_timer_options; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 967f2686adb..7ee4ed901ba 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -46,7 +46,7 @@ * lapic (always-on,ARAT) ------ 150 */ -int mrst_timer_options __cpuinitdata; +__cpuinitdata enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -- cgit v1.2.3 From c0011dbfce69467b23b08fb4a64c39a409a935fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 4 Feb 2010 14:46:34 -0800 Subject: xen: use _PAGE_IOMAP in ioremap to do machine mappings In a Xen domain, ioremap operates on machine addresses, not pseudo-physical addresses. We use _PAGE_IOMAP to determine whether a mapping is intended for machine addresses. [ Impact: allow Xen domain to map real hardware ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 8 ++--- arch/x86/xen/mmu.c | 71 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 018a0a40079..bf5f7d32bd0 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) */ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { - extern unsigned long max_mapnr; unsigned long pfn = mfn_to_pfn(mfn); - if ((pfn < max_mapnr) - && !xen_feature(XENFEAT_auto_translated_physmap) - && (get_phys_to_machine(pfn) != mfn)) - return max_mapnr; /* force !pfn_valid() */ - /* XXX fixme; not true with sparsemem */ + if (get_phys_to_machine(pfn) != mfn) + return -1; /* force !pfn_valid() */ return pfn; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..a4dea9df0cc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,9 +56,11 @@ #include #include +#include #include #include #include +#include #include #include "multicalls.h" @@ -377,6 +379,28 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } +static bool xen_iomap_pte(pte_t pte) +{ + return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); +} + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + static void 
xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -453,6 +477,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + if (xen_iomap_pte(pteval)) { + xen_set_iomap_pte(ptep, pteval); + goto out; + } + ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -523,8 +552,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) return val; } +static pteval_t iomap_pte(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + + /* We assume the pte frame number is a MFN, so + just use it as-is. */ + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + pteval_t xen_pte_val(pte_t pte) { + if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) + return pte.pte; + return pte_mfn_to_pfn(pte.pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -537,7 +583,11 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - pte = pte_pfn_to_mfn(pte); + if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + pte = iomap_pte(pte); + else + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); @@ -593,6 +643,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + ADD_STATS(pte_update, 1); // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); @@ -609,6 +664,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) #ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + set_64bit((u64 *)ptep, native_pte_val(pte)); } @@ -1811,9 +1871,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; - default: + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ pte = mfn_pte(phys, prot); break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + break; } __native_set_fixmap(idx, pte); -- cgit v1.2.3 From 7347b4082e55ac4a673f06a0a0ce25c37273c9ec Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 19 Feb 2010 13:31:06 -0500 Subject: xen: Allow unprivileged Xen domains to create iomap pages PV DomU domains are allowed to map hardware MFNs for PCI passthrough, but are not generally allowed to map raw machine pages. In particular, various pieces of code try to map DMI and ACPI tables in the ISA ROM range. We disallow _PAGE_IOMAP for those mappings, so that they are redirected to a set of local zeroed pages we reserve for that purpose. 
[ Impact: prevent passthrough of ISA space, as we only allow PCI ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ++++ arch/x86/xen/mmu.c | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..3254e8bc4cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1145,6 +1145,10 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a4dea9df0cc..a5577f59416 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -381,7 +382,7 @@ static bool xen_page_pinned(void *ptr) static bool xen_iomap_pte(pte_t pte) { - return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); + return pte_flags(pte) & _PAGE_IOMAP; } static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) @@ -583,10 +584,21 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + phys_addr_t addr = (pte & PTE_PFN_MASK); + + /* + * Unprivileged domains are allowed to do IOMAPpings for + * PCI passthrough, but not map ISA space. The ISA + * mappings are just dummy local mappings to keep other + * parts of the kernel happy. + */ + if (unlikely(pte & _PAGE_IOMAP) && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { pte = iomap_pte(pte); - else + } else { + pte &= ~_PAGE_IOMAP; pte = pte_pfn_to_mfn(pte); + } return native_make_pte(pte); } -- cgit v1.2.3 From 19001c8c5bfa032ed45b10dfe48e355f5df88c61 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Rename the balloon lock * xen_create_contiguous_region needs access to the balloon lock to ensure memory doesn't change under its feet, so expose the balloon lock * Change the name of the lock to xen_reservation_lock, to imply it's now less-specific usage. [ Impact: cleanup ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a5577f59416..9e0d82fc21e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -70,6 +70,13 @@ #define MMU_UPDATE_HISTO 30 +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + #ifdef CONFIG_XEN_DEBUG_FS static struct { -- cgit v1.2.3 From 08bbc9da92f7e44b9c208c6a1adba70c403b255e Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Add xen_create_contiguous_region A memory region must be physically contiguous in order to be accessed through DMA. This patch adds xen_create_contiguous_region, which ensures a region of contiguous virtual memory is also physically contiguous. Based on Stephen Tweedie's port of the 2.6.18-xen version. 
Remove contiguous_bitmap[] as it's no longer needed. Ported from linux-2.6.18-xen.hg 707:e410857fd83c [ Impact: add Xen-internal API to make pages phys-contig ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9e0d82fc21e..eb51402dd99 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -2027,6 +2028,206 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL << order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; +} + +int xen_create_contiguous_region(unsigned long vstart, unsigned int order, + unsigned int address_bits) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) { + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; -- cgit v1.2.3 From e768aee89c687a50e6a2110e30c5cae1fbf0d2da Mon Sep 17 00:00:00 2001 From: Livio Soares Date: Thu, 3 Jun 2010 15:00:31 -0400 Subject: perf, x86: Small fix to cpuid10_edx Fixes to 'cpuid10_edx' to comply with Intel documentation. According to the Intel Manual, Volume 2A, Table 3-12, the cpuid for architecture performance monitoring returns, in EDX, two pieces of information: 1) Number of fixed-function counters (5 bits, not 4) 2) Width of fixed-function counters (8 bits) Signed-off-by: Livio Soares Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arjan van de Ven Cc: "H. 
Peter Anvin" LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e..6ed3ae4f548 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -68,8 +68,9 @@ union cpuid10_eax { union cpuid10_edx { struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; + unsigned int num_counters_fixed:5; + unsigned int bit_width_fixed:8; + unsigned int reserved:19; } split; unsigned int full; }; -- cgit v1.2.3 From 12a6611fa16e9c6d2f844fe2175d219c6e9bd95d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: Calculate BAU destination timeout Calculate the Broadcast Assist Unit's destination timeout period from the values in the relevant MMR's. Store it in each cpu's per-cpu BAU structure so that a destination timeout can be differentiated from a 'plugged' situation in which all software ack resources are already allocated and a timeout is pending. That case returns an immediate destination error. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 12 ++++++++++ arch/x86/kernel/tlb_uv.c | 51 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index aa558ac0306..458e04c626a 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -49,6 +49,18 @@ #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL +/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ +#define BAU_MISC_CONTROL_MULT_MASK 3 + +#define UVH_AGING_PRESCALE_SEL 0x000000b000UL +/* [30:28] URGENCY_7 an index into a table of times */ +#define BAU_URGENCY_7_SHIFT 28 +#define BAU_URGENCY_7_MASK 7 + +#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL +/* [45:40] BAU - BAU transaction timeout select - a multiplier */ +#define BAU_TRANS_SHIFT 40 +#define BAU_TRANS_MASK 0x3f /* * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555929e..5506836c4a8 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -30,6 +30,19 @@ struct msg_desc { struct bau_payload_queue_entry *va_queue_last; }; +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; +static int timeout_us; + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL static int uv_bau_max_concurrent __read_mostly; @@ -423,7 +436,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * pending. In that case hardware returns the * ERROR that looks like a destination timeout. 
*/ - if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + if (cycles_2_us(ttime - bcp->send_message) < + timeout_us) { bcp->conseccompletes = 0; return FLUSH_RETRY_PLUGGED; } @@ -908,12 +922,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) } static inline unsigned long long -millisec_2_cycles(unsigned long millisec) +microsec_2_cycles(unsigned long microsec) { unsigned long ns; unsigned long long cyc; - ns = millisec * 1000; + ns = microsec * 1000; cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); return cyc; } @@ -1258,6 +1272,33 @@ static void __init uv_init_uvhub(int uvhub, int vector) ((apicid << 32) | vector)); } +/* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has to be calculated from them. + */ +static int +calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + return ret; +} + /* * initialize the bau_control structure for each cpu */ @@ -1286,6 +1327,8 @@ static void uv_init_per_cpu(int nuvhubs) }; struct uvhub_desc *uvhub_descs; + timeout_us = calculate_destination_timeout(); + uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); @@ -1301,7 +1344,7 @@ static void uv_init_per_cpu(int nuvhubs) bdp->uvhub = uvhub; bdp->pnode = pnode; /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = millisec_2_cycles(3); + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) -- cgit v1.2.3 From e8e5e8a8048006a12d7777a93baebd6e39496101 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: BAU tunables into a debugfs file Make the Broadcast Assist Unit driver's nine tuning values variable by making them accessible through a read/write debugfs file. The file will normally appear as /sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each cpu's per-cpu BAU structure. The patch also does a little name improvement, and corrects the reset of two destination timeout counters. 
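As a usage illustration (a hedged sketch, not part of the patch): the file takes exactly nine whitespace-separated integers on a write, in the order named by the file's own header line, and a value of 0 restores that tunable's built-in default. Assuming debugfs is mounted at /sys/kernel/debug, a user-space fragment like the following would set max_bau_concurrent to 2 and leave the other eight tunables at their defaults:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sgi_uv/bau_tunables";
	/* max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
	   ipi_reset_limit complete_threshold congested_response_us
	   congested_reps congested_period; 0 means "use the default" */
	const char *vals = "2 0 0 0 0 0 0 0 0\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, vals, strlen(vals)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the file back shows the tunable names on the first line and the nine current values on the second.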
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 53 +++++--- arch/x86/kernel/tlb_uv.c | 281 +++++++++++++++++++++++++++++++++------ 2 files changed, 278 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 458e04c626a..e5543c1a80c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -45,10 +45,14 @@ #define UV_DESC_BASE_PNODE_SHIFT 49 #define UV_PAYLOADQ_PNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define UV_BAU_BASENAME "sgi_uv/bau_tunables" +#define UV_BAU_TUNABLES_DIR "sgi_uv" +#define UV_BAU_TUNABLES_FILE "bau_tunables" +#define WHITESPACE " \t\n" #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL /* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -70,25 +74,23 @@ #define DESC_STATUS_DESTINATION_TIMEOUT 2 #define DESC_STATUS_SOURCE_TIMEOUT 3 +#define TIMEOUT_DELAY 10 /* - * source side threshholds at which message retries print a warning - */ -#define SOURCE_TIMEOUT_LIMIT 20 -#define DESTINATION_TIMEOUT_LIMIT 20 - -/* - * misc. delays, in microseconds + * delay for 'plugged' timeout retries, in microseconds */ -#define THROTTLE_DELAY 10 -#define TIMEOUT_DELAY 10 -#define BIOS_TO 1000 -/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */ +#define PLUGGED_DELAY 10 /* * threshholds at which to use IPI to free resources */ +/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ #define PLUGSB4RESET 100 -#define TIMEOUTSB4RESET 100 +/* after this many consecutive timeouts, use IPI to release resources */ +#define TIMEOUTSB4RESET 1 +/* at this number uses of IPI to release resources, giveup the request */ +#define IPI_RESET_LIMIT 1 +/* after this # consecutive successes, bump up the throttle if it was lowered */ +#define COMPLETE_THRESHOLD 5 /* * number of entries in the destination side payload queue @@ -107,6 +109,13 @@ #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +/* + * tuning the action when the numalink network is extremely delayed + */ +#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ +#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ +#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ + /* * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) * If the 'multilevel' flag in the header portion of the descriptor @@ -323,14 +332,13 @@ struct bau_control { struct bau_control *uvhub_master; struct bau_control *socket_master; unsigned long timeout_interval; + unsigned long set_bau_on_time; atomic_t active_descriptor_count; - int max_concurrent; - int max_concurrent_constant; - int retry_message_scans; int plugged_tries; int timeout_tries; int ipi_attempts; int conseccompletes; + int set_bau_off; short cpu; short uvhub_cpu; short uvhub; @@ -343,6 +351,19 @@ struct bau_control { spinlock_t masks_lock; spinlock_t uvhub_lock; spinlock_t queue_lock; + /* tunables */ + int max_bau_concurrent; + int max_bau_concurrent_constant; + int plugged_delay; + int plugsb4reset; + int timeoutsb4reset; + int ipi_reset_limit; + int complete_threshold; + int 
congested_response_us; + int congested_reps; + int congested_period; + cycles_t period_time; + long period_requests; }; /* diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 5506836c4a8..c8661779c51 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include @@ -42,12 +43,22 @@ static int timeout_base_ns[] = { 167772160 }; static int timeout_us; +static int nobau; -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL - -static int uv_bau_max_concurrent __read_mostly; +/* tunables: */ +static int max_bau_concurrent = MAX_BAU_CONCURRENT; +static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_response_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; +static struct dentry *tunables_dir; +static struct dentry *tunables_file; -static int nobau; static int __init setup_nobau(char *arg) { nobau = 1; @@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, unsigned long index; cycles_t time1; cycles_t time2; + cycles_t elapsed; struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; /* - * Spin here while there are hmaster->max_concurrent or more active + * Spin here while there are hmaster->max_bau_concurrent or more active * descriptors. This is the per-uvhub 'throttle'. */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)) { + hmaster->max_bau_concurrent)) { stat->s_throttles++; do { cpu_relax(); } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)); + hmaster->max_bau_concurrent)); } while (hmaster->uvhub_quiesce) @@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, * that case hardware immediately returns the ERROR * that looks like a destination timeout. 
*/ - udelay(TIMEOUT_DELAY); + udelay(bcp->plugged_delay); bcp->plugged_tries++; - if (bcp->plugged_tries >= PLUGSB4RESET) { + if (bcp->plugged_tries >= bcp->plugsb4reset) { bcp->plugged_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_plug++; } } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_concurrent = 1; + hmaster->max_bau_concurrent = 1; bcp->timeout_tries++; udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { bcp->timeout_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_timeout++; } } - if (bcp->ipi_attempts >= 3) { + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; completion_status = FLUSH_GIVEUP; break; @@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) - && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) - hmaster->max_concurrent++; + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + + if ((completion_status == FLUSH_COMPLETE) && + (bcp->conseccompletes > bcp->complete_threshold) && + (hmaster->max_bau_concurrent < + hmaster->max_bau_concurrent_constant)) + hmaster->max_bau_concurrent++; /* * hold any cpu not timing out here; no other cpu currently held by @@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, atomic_dec(&hmaster->active_descriptor_count); /* guard against cycles wrap */ - if (time2 > time1) - stat->s_time += (time2 - time1); - else + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; @@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; + /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + /* * Each sending cpu has a per-cpu mask which it fills from the caller's * cpu mask. Only remote cpus are converted to uvhubs and copied. 
@@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", @@ -985,10 +1006,29 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) return 0; } +/* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[300]; + int ret; + + ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_bau_concurrent plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_bau_concurrent, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_response_us, congested_reps, congested_period); + + return simple_read_from_buffer(userbuf, count, ppos, buf, ret); +} + /* * -1: resetf the statistics * 0: display meaning of the statistics - * >0: maximum concurrent active descriptors per uvhub (throttle) */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) @@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long input_arg; char optstr[64]; struct ptc_stats *stat; - struct bau_control *bcp; if (count == 0 || count > sizeof(optstr)) return -EINVAL; @@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, stat = &per_cpu(ptcstats, cpu); memset(stat, 0, sizeof(struct ptc_stats)); } - } else { - uv_bau_max_concurrent = input_arg; - bcp = &per_cpu(bau_control, smp_processor_id()); - if (uv_bau_max_concurrent < 1 || - uv_bau_max_concurrent > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d; %d is invalid\n", - bcp->max_concurrent, uv_bau_max_concurrent); - return -EINVAL; - } - printk(KERN_DEBUG "Set BAU max concurrent:%d\n", - uv_bau_max_concurrent); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_concurrent = uv_bau_max_concurrent; + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... 
'9': + val = 10*val+(*name-'0'); + break; + default: + return val; } } +} + +/* + * set the tunables + * 0 values reset them to defaults + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int cnt = 0; + int val; + char *p; + char *q; + char instr[64]; + struct bau_control *bcp; + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + /* count the fields */ + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != 9) { + printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_bau_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent_constant = + MAX_BAU_CONCURRENT; + continue; + } + bcp = &per_cpu(bau_control, smp_processor_id()); + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_bau_concurrent = val; + max_bau_concurrent_constant = val; + continue; + case 1: + if (val == 0) + plugged_delay = PLUGGED_DELAY; + else + plugged_delay = val; + continue; + case 2: + if (val == 0) + plugsb4reset = PLUGSB4RESET; + else + plugsb4reset = val; + continue; + case 3: + if (val == 0) + timeoutsb4reset = TIMEOUTSB4RESET; + else + timeoutsb4reset = val; + continue; + case 4: + if (val == 0) + ipi_reset_limit = IPI_RESET_LIMIT; + else + ipi_reset_limit = val; + continue; + case 5: + if (val == 0) + complete_threshold = COMPLETE_THRESHOLD; + else + complete_threshold = val; + continue; + case 6: + if (val == 0) + congested_response_us = CONGESTED_RESPONSE_US; + else + congested_response_us = val; + continue; + case 7: + if (val == 0) + congested_reps = CONGESTED_REPS; + else + congested_reps = val; + continue; + case 8: + if (val == 0) + congested_period = CONGESTED_PERIOD; + else + congested_period = val; + continue; + } + if (q == p) + break; + } + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } return count; } @@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) return seq_open(file, &uv_ptc_seq_ops); } +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + static const struct file_operations proc_uv_ptc_operations = { .open = uv_ptc_proc_open, .read = seq_read, @@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = { .release = seq_release, }; +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, +}; + static int __init uv_ptc_init(void) { struct proc_dir_entry *proc_uv_ptc; @@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void) 
UV_PTC_BASENAME); return -EINVAL; } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } return 0; } @@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); spin_lock_init(&bcp->masks_lock); - bcp->max_concurrent = uv_bau_max_concurrent; pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) @@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs) } } kfree(uvhub_descs); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } } /* @@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - uv_bau_max_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); @@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void) return 0; } core_initcall(uv_bau_init); -core_initcall(uv_ptc_init); +fs_initcall(uv_ptc_init); -- cgit v1.2.3 From 50fb55acc5bbe5ee29d0a65262f4ec286b14d156 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Disable BAU on network congestion The numalink network can become so congested that TLB shootdown using the Broadcast Assist Unit becomes slower than using IPI's. In that case, disable the use of the BAU for a period of time. The period is tunable. When the period expires the use of the BAU is re-enabled. A count of these actions is added to the statistics file. 
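The decision itself is simple arithmetic; a simplified restatement (stripped of the per-cpu bookkeeping and locking that the patch below adds) of the trigger condition is:

#include <stdbool.h>

/* All values are in cycles; congested_cycles is the cycle equivalent
 * of the congested_response_us tunable.  Disable the BAU once the
 * mean broadcast latency over the sampling period exceeds it. */
static bool should_disable_bau(unsigned long period_time,
			       long period_requests,
			       unsigned long congested_cycles)
{
	if (period_requests == 0)
		return false;
	return (period_time / period_requests) > congested_cycles;
}

In the patch, this check only runs after more than congested_reps broadcasts have been timed, and the cpu that trips it schedules the re-enable for congested_period seconds later.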
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 4 +++ arch/x86/kernel/tlb_uv.c | 76 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index e5543c1a80c..9b3e750ef2d 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -34,6 +34,7 @@ */ #define UV_ITEMS_PER_DESCRIPTOR 8 +/* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 #define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 @@ -338,6 +339,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; + int baudisabled; int set_bau_off; short cpu; short uvhub_cpu; @@ -389,6 +391,8 @@ struct ptc_stats { unsigned long s_busy; /* status stayed busy past s/w timer */ unsigned long s_throttles; /* waits in throttle */ unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index c8661779c51..dc6a6831275 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -44,6 +44,9 @@ static int timeout_base_ns[] = { }; static int timeout_us; static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; /* tunables: */ static int max_bau_concurrent = MAX_BAU_CONCURRENT; @@ -519,6 +522,35 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Completions are taking a very long time due to a congested numalink + * network. 
+ */ +static void +disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles() + + sec_2_cycles(bcp->congested_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + spin_unlock(&disable_lock); +} + /** * uv_flush_send_and_wait * @@ -681,6 +713,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->congested_reps)) { + disable_for_congestion(bcp, stat); + } + } } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) @@ -747,12 +787,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + struct bau_control *tbcp; /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + stat = &per_cpu(ptcstats, cpu); + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + /* the cpu that disabled it must re-enable it */ + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + } + } + return cpumask; + } /* * Each sending cpu has a per-cpu mask which it fills from the caller's @@ -793,7 +853,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, else return NULL; } - stat = &per_cpu(ptcstats, cpu); stat->s_requestor++; stat->s_ntargcpu += remotes; remotes = bau_uvhub_weight(&bau_desc->distribution); @@ -973,7 +1032,9 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "sw_ack recv rtime all "); seq_printf(file, - "one mult none retry canc nocan reset rcan\n"); + "one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -993,7 +1054,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", uv_read_global_mmr64(uv_cpu_to_pnode(cpu), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), stat->d_requestee, cycles_2_us(stat->d_time), @@ -1001,6 +1062,8 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); + seq_printf(file, "%ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled); } return 0; @@ -1112,6 +1175,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, "reset: number of ipi-style reset requests processed\n"); printk(KERN_DEBUG "rcan: number messages canceled by reset 
requests\n"); + printk(KERN_DEBUG + "disable: number times use of the BAU was disabled\n"); + printk(KERN_DEBUG + "enable: number times use of the BAU was re-enabled\n"); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1568,6 +1635,7 @@ static void uv_init_per_cpu(int nuvhubs) kfree(uvhub_descs); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1609,6 +1677,8 @@ static int __init uv_bau_init(void) uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = microsec_2_cycles(congested_response_us); uv_init_per_cpu(nuvhubs); -- cgit v1.2.3 From 712157aa703a01f58c7c17452096ab00b774d0a9 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Shorten access to BAU statistics structure Use a pointer from the per-cpu BAU control structure to the per-cpu BAU statistics structure. We nearly always know the first before needing the second. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/kernel/tlb_uv.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 9b3e750ef2d..6a42d42eb8f 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -332,6 +332,7 @@ struct bau_control { struct bau_payload_queue_entry *bau_msg_head; struct bau_control *uvhub_master; struct bau_control *socket_master; + struct ptc_stats *statp; unsigned long timeout_interval; unsigned long set_bau_on_time; atomic_t active_descriptor_count; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc6a6831275..261b9653cde 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -153,7 +153,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, struct ptc_stats *stat; msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_retries++; /* * cancel any message from msg+1 to the retry itself @@ -217,7 +217,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, * This must be a normal message, or retry of a normal message */ msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); stat->d_alltlb++; @@ -301,7 +301,7 @@ uv_do_reset(void *ptr) bcp = &per_cpu(bau_control, smp_processor_id()); rap = (struct reset_args *)ptr; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_resets++; /* @@ -419,7 +419,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mask; cycles_t ttime; cycles_t timeout_time; - struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; @@ -583,7 +583,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, cycles_t time1; cycles_t time2; cycles_t elapsed; - struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; @@ -794,7 +794,7 @@ const struct 
cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, return cpumask; bcp = &per_cpu(bau_control, cpu); - stat = &per_cpu(ptcstats, cpu); + stat = bcp->statp; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -903,7 +903,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); - stat = &per_cpu(ptcstats, smp_processor_id()); + stat = bcp->statp; msgdesc.va_queue_first = bcp->va_queue_first; msgdesc.va_queue_last = bcp->va_queue_last; msg = bcp->bau_msg_head; @@ -1636,6 +1636,7 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1673,7 +1674,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); -- cgit v1.2.3 From 4faca1550838708d71f6eea14cdacb0876c3a5a4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU structure rearranging Move some structure definitions from the C code to the BAU header file, and change the organization of that header file a little. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 90 +++++++++++++++++++++++----------------- arch/x86/kernel/tlb_uv.c | 12 ------ 2 files changed, 51 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6a42d42eb8f..1c8f1e9bf74 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -322,6 +322,57 @@ struct bau_payload_queue_entry { /* bytes 24-31 */ }; +struct msg_desc { + struct bau_payload_queue_entry *msg; + int msg_slot; + int sw_ack_slot; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; +}; + +struct reset_args { + int sender; +}; + +/* + * This structure is allocated per_cpu for UV TLB shootdown statistics. 
+ */ +struct ptc_stats { + /* sender statistics */ + unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ + unsigned long s_requestor; /* number of shootdown requests */ + unsigned long s_stimeout; /* source side timeouts */ + unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_time; /* time spent in sending side */ + unsigned long s_retriesok; /* successful retries */ + unsigned long s_ntargcpu; /* total number of cpu's targeted */ + unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ + unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ + unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ + unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ + unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ + unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */ + unsigned long s_resets_plug; /* ipi-style resets from plug state */ + unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ + unsigned long s_busy; /* status stayed busy past s/w timer */ + unsigned long s_throttles; /* waits in throttle */ + unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ + /* destination statistics */ + unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ + unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ + unsigned long d_multmsg; /* interrupts with multiple messages */ + unsigned long d_nomsg; /* interrupts with no message */ + unsigned long d_time; /* time spent on destination side */ + unsigned long d_requestee; /* number of messages processed */ + unsigned long d_retries; /* number of retry messages processed */ + unsigned long d_canceled; /* number of messages canceled by retries */ + unsigned long d_nocanceled; /* retries that found nothing to cancel */ + unsigned long d_resets; /* number of ipi-style requests processed */ + unsigned long d_rcanceled; /* number of messages canceled by resets */ +}; + /* * one per-cpu; to locate the software tables */ @@ -369,45 +420,6 @@ struct bau_control { long period_requests; }; -/* - * This structure is allocated per_cpu for UV TLB shootdown statistics. 
- */ -struct ptc_stats { - /* sender statistics */ - unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ - unsigned long s_requestor; /* number of shootdown requests */ - unsigned long s_stimeout; /* source side timeouts */ - unsigned long s_dtimeout; /* destination side timeouts */ - unsigned long s_time; /* time spent in sending side */ - unsigned long s_retriesok; /* successful retries */ - unsigned long s_ntargcpu; /* number of cpus targeted */ - unsigned long s_ntarguvhub; /* number of uvhubs targeted */ - unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */ - unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */ - unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */ - unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */ - unsigned long s_ntarguvhub1; /* number of times == 1 target hub */ - unsigned long s_resets_plug; /* ipi-style resets from plug state */ - unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ - unsigned long s_busy; /* status stayed busy past s/w timer */ - unsigned long s_throttles; /* waits in throttle */ - unsigned long s_retry_messages; /* retry broadcasts */ - unsigned long s_bau_reenabled; /* for bau enable/disable */ - unsigned long s_bau_disabled; /* for bau enable/disable */ - /* destination statistics */ - unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ - unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ - unsigned long d_multmsg; /* interrupts with multiple messages */ - unsigned long d_nomsg; /* interrupts with no message */ - unsigned long d_time; /* time spent on destination side */ - unsigned long d_requestee; /* number of messages processed */ - unsigned long d_retries; /* number of retry messages processed */ - unsigned long d_canceled; /* number of messages canceled by retries */ - unsigned long d_nocanceled; /* retries that found nothing to cancel */ - unsigned long d_resets; /* number of ipi-style requests processed */ - unsigned long d_rcanceled; /* number of messages canceled by resets */ -}; - static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 261b9653cde..d7592903984 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,14 +23,6 @@ #include #include -struct msg_desc { - struct bau_payload_queue_entry *msg; - int msg_slot; - int sw_ack_slot; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; -}; - /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -79,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); -struct reset_args { - int sender; -}; - /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. -- cgit v1.2.3 From 39847e7f3c8198b14102fe7ba4b3a6a1d84bbcfe Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU software acknowledge Correct the acknowledgment and the reset of a BAU software-acknowledged message. A retry message should be testing only for timed-out resources (mask << 8). 
(And we delete a log message that might cause unnecessary concern) The acknowledge MMR is |--timed-out--|---pending--|, each is 8 bits. The IPI-driven reset of software acknowledge resources frees both timed out and pending resources. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d7592903984..295a41122da 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -161,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, slot2 = msg2 - mdp->va_queue_first; mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg2->sw_ack_vector << 8) | - msg2->sw_ack_vector); + msg_res = msg2->sw_ack_vector; /* * This is a message retry; clear the resources held * by the previous message only if they timed out. * If it has not timed out we have an unexpected * situation to report. */ - if (mmr & (msg_res << 8)) { + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { /* * is the resource timed out? * make everyone ignore the cancelled message. @@ -179,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, cancel_count++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << 8) | msg_res); - } else - printk(KERN_INFO "note bau retry: no effect\n"); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } } } if (!cancel_count) @@ -317,13 +316,13 @@ uv_do_reset(void *ptr) */ mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg->sw_ack_vector << 8) | - msg->sw_ack_vector); + msg_res = msg->sw_ack_vector; if (mmr & msg_res) { stat->d_rcanceled++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - msg_res); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); } } } -- cgit v1.2.3 From a8328ee58c15c9d763a67607a35bb987b38950fa Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU discovery of hubs and sockets Correct the initialization-time assumption of contiguous blade numbers and of sockets numbered from zero. There may be hubs present with no cpu's enabled. There may be disabled sockets such that the active socket is not number zero. And assign a 'socket master' by assuming that a socket is a node. 
(it is not safe to extract socket number from an apicid) Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 49 ++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 295a41122da..ab929e97650 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1547,11 +1547,13 @@ calculate_destination_timeout(void) */ static void uv_init_per_cpu(int nuvhubs) { - int i, j, k; + int i; int cpu; int pnode; int uvhub; short socket = 0; + unsigned short socket_mask; + unsigned int uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1562,7 +1564,7 @@ static void uv_init_per_cpu(int nuvhubs) short cpu_number[16]; }; struct uvhub_desc { - short num_sockets; + unsigned short socket_mask; short num_cpus; short uvhub; short pnode; @@ -1581,43 +1583,54 @@ static void uv_init_per_cpu(int nuvhubs) spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + uvhub_mask |= (1 << uvhub); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* kludge: assume uv_hub.h is constant */ - socket = (cpu_physical_id(cpu)>>5)&1; - if (socket >= bdp->num_sockets) - bdp->num_sockets = socket+1; + /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ + socket = (cpu_to_node(cpu) & 1); + bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - socket = 0; - for_each_possible_blade(uvhub) { + uvhub = 0; + while (uvhub_mask) { + if (!(uvhub_mask & 1)) + goto nexthub; bdp = &uvhub_descs[uvhub]; - for (i = 0; i < bdp->num_sockets; i++) { - sdp = &bdp->socket[i]; - for (j = 0; j < sdp->num_cpus; j++) { - cpu = sdp->cpu_number[j]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + if (!(socket_mask & 1)) + goto nextsocket; + sdp = &bdp->socket[socket]; + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; bcp = &per_cpu(bau_control, cpu); bcp->cpu = cpu; - if (j == 0) { + if (i == 0) { smaster = bcp; - if (i == 0) + if (socket == 0) hmaster = bcp; } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = smaster; + bcp->uvhub = bdp->uvhub; bcp->uvhub_master = hmaster; - for (k = 0; k < DEST_Q_SIZE; k++) - bcp->socket_acknowledge_count[k] = 0; - bcp->uvhub_cpu = - uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> + blade_processor_id; } +nextsocket: socket++; + socket_mask = (socket_mask >> 1); } +nexthub: + uvhub++; + uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); for_each_present_cpu(cpu) { -- cgit v1.2.3 From 90cc7d944981a6d06b49bb26fde1b490e28c90e5 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Remove BAU check for stay-busy Remove a faulty assumption that a long running BAU request has encountered a hardware problem and will never finish. Numalink congestion can make a request appear to have encountered such a problem, but it is not safe to cancel the request. If such a cancel is done but a reply is later received we can miss a TLB shootdown. We depend upon the max_bau_concurrent 'throttle' to prevent the stay-busy case from happening. 
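For reference, the throttle in question is the admission gate at the top of uv_flush_send_and_wait(): a sender must raise active_descriptor_count without reaching max_bau_concurrent before it may start a broadcast, and it decrements the count once its descriptor completes. A toy model of that check (self-contained C11 atomics standing in for the kernel's spinlock-based atomic_inc_unless_ge()):

#include <stdatomic.h>

/* Returns 1 and increments *v only while *v is below 'limit';
 * returns 0 otherwise, in which case the caller spins and retries. */
static int inc_unless_ge(atomic_int *v, int limit)
{
	int cur = atomic_load(v);

	while (cur < limit) {
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return 1;	/* admitted; one more active descriptor */
	}
	return 0;			/* at the limit; not admitted */
}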
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/kernel/tlb_uv.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 1c8f1e9bf74..c19b870ea58 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -402,7 +402,6 @@ struct bau_control { unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; cycles_t send_message; - spinlock_t masks_lock; spinlock_t uvhub_lock; spinlock_t queue_lock; /* tunables */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ab929e97650..dc962b5ac87 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -405,12 +405,10 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr; unsigned long mask; cycles_t ttime; - cycles_t timeout_time; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; - timeout_time = get_cycles() + bcp->timeout_interval; /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) @@ -450,26 +448,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * descriptor_status is still BUSY */ cpu_relax(); - relaxes++; - if (relaxes >= 10000) { - relaxes = 0; - if (get_cycles() > timeout_time) { - quiesce_local_uvhub(hmaster); - - /* single-thread the register change */ - spin_lock(&hmaster->masks_lock); - mmr = uv_read_local_mmr(mmr_offset); - mask = 0UL; - mask |= (3UL < right_shift); - mask = ~mask; - mmr &= mask; - uv_write_local_mmr(mmr_offset, mmr); - spin_unlock(&hmaster->masks_lock); - end_uvhub_quiesce(hmaster); - stat->s_busy++; - return FLUSH_GIVEUP; - } - } } } bcp->conseccompletes++; @@ -1580,7 +1558,6 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); - spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; uvhub_mask |= (1 << uvhub); -- cgit v1.2.3 From 7fba1bcd4844a4a8619a03bf51cabc92aea365a8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU regular message type The Broadcast Assist Unit messages have a regular or retry message type. The regular type was not being set, but needs to be, because the lack of a message type is sometimes used to identify an unused entry in the message queue. Also removing some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc962b5ac87..4cb14dbd7fa 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -580,23 +580,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } time1 = get_cycles(); do { - /* - * Every message from any given cpu gets a unique message - * sequence number. But retries use that same number. - * Our message may have timed out at the destination because - * all sw-ack resources are in use and there is a timeout - * pending there. In that case, our last send never got - * placed into the queue and we need to persist until it - * does. 
- * - * Make any retry a type MSG_RETRY so that the destination will - * free any resource held by a previous message from this cpu. - */ if (try == 0) { - /* use message type set by the caller the first time */ + bau_desc->header.msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - /* use RETRY type on all the rest; same sequence */ bau_desc->header.msg_type = MSG_RETRY; stat->s_retry_messages++; } -- cgit v1.2.3 From 450a007eebaf430426ea8f89bbc3f287949905b2 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU broadcast to the local hub Make the Broadcast Assist Unit driver use the BAU for TLB shootdowns of cpu's on the local uvhub. It was previously thought that IPI might be faster to the cpu's on the local hub. But the IPI operation would have to follow the completion of the BAU broadcast anyway. So we broadcast to the local uvhub in all cases except when the current cpu was the only local cpu in the mask. This simplifies uv_flush_send_and_wait() in that it returns either all shootdowns complete, or none. Adjust the statistics to account for shootdowns on the local uvhub. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 5 ++ arch/x86/kernel/tlb_uv.c | 138 +++++++++++++++------------------------ 2 files changed, 58 insertions(+), 85 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index c19b870ea58..7f6ea611cb7 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -346,6 +346,11 @@ struct ptc_stats { unsigned long s_time; /* time spent in sending side */ unsigned long s_retriesok; /* successful retries */ unsigned long s_ntargcpu; /* total number of cpu's targeted */ + unsigned long s_ntargself; /* times the sending cpu was targeted */ + unsigned long s_ntarglocals; /* targets of cpus on the local blade */ + unsigned long s_ntargremotes; /* targets of cpus on remote blades */ + unsigned long s_ntarglocaluvhub; /* targets of the local hub */ + unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 4cb14dbd7fa..a1615058fad 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -400,10 +400,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, int this_cpu, struct bau_control *bcp, struct bau_control *smaster, long try) { - int relaxes = 0; unsigned long descriptor_status; - unsigned long mmr; - unsigned long mask; cycles_t ttime; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; @@ -524,25 +521,19 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * The flush_mask contains the cpus the broadcast is to be sent to, plus * cpus that are on the local uvhub. * - * Returns NULL if all flushing represented in the mask was done. The mask - * is zeroed. - * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set, representing any cpus on the local - * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. + * Returns 0 if all flushing represented in the mask was done. 
+ * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. */ -const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, - struct bau_control *bcp) +int uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, struct bau_control *bcp) { int right_shift; - int uvhub; - int bit; int completion_status = 0; int seq_number = 0; long try = 0; int cpu = bcp->uvhub_cpu; int this_cpu = bcp->cpu; - int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; @@ -552,10 +543,6 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; - /* - * Spin here while there are hmaster->max_bau_concurrent or more active - * descriptors. This is the per-uvhub 'throttle'. - */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)) { @@ -591,9 +578,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, right_shift, this_cpu, bcp, smaster, try); @@ -652,16 +637,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (hmaster->max_bau_concurrent < hmaster->max_bau_concurrent_constant)) hmaster->max_bau_concurrent++; - - /* - * hold any cpu not timing out here; no other cpu currently held by - * the 'throttle' should enter the activation code - */ while (hmaster->uvhub_quiesce) cpu_relax(); atomic_dec(&hmaster->active_descriptor_count); - - /* guard against cycles wrap */ if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; @@ -674,32 +652,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } } } else - stat->s_requestor--; /* don't count this one */ + stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; else if (completion_status == FLUSH_GIVEUP) { - /* - * Cause the caller to do an IPI-style TLB shootdown on - * the target cpu's, all of which are still in the mask. - */ stat->s_giveup++; - return flush_mask; + return 1; } - - /* - * Success, so clear the remote cpu's from the mask so we don't - * use the IPI method of shootdown on them. - */ - for_each_cpu(bit, flush_mask) { - uvhub = uv_cpu_to_blade_id(bit); - if (uvhub == this_uvhub) - continue; - cpumask_clear_cpu(bit, flush_mask); - } - if (!cpumask_empty(flush_mask)) - return flush_mask; - - return NULL; + return 0; } /** @@ -731,10 +691,11 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int remotes; int tcpu; int uvhub; int locals = 0; + int remotes = 0; + int hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; @@ -768,54 +729,52 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. Only remote cpus are converted to uvhubs and copied. + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. 
*/ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* - * copy cpumask to flush_mask, removing current cpu - * (current cpu should already have been flushed by the caller and - * should never be returned if we return flush_mask) - */ + /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); if (cpu_isset(cpu, *cpumask)) - locals++; /* current cpu was targeted */ + stat->s_ntargself++; bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - remotes = 0; + + /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { uvhub = uv_cpu_to_blade_id(tcpu); - if (uvhub == bcp->uvhub) { - locals++; - continue; - } bau_uvhub_set(uvhub, &bau_desc->distribution); - remotes++; - } - if (remotes == 0) { - /* - * No off_hub flushing; return status for local hub. - * Return the caller's mask if all were local (the current - * cpu may be in that mask). - */ - if (locals) - return cpumask; + if (uvhub == bcp->uvhub) + locals++; else - return NULL; + remotes++; } + if ((locals + remotes) == 0) + return NULL; stat->s_requestor++; - stat->s_ntargcpu += remotes; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; remotes = bau_uvhub_weight(&bau_desc->distribution); - stat->s_ntarguvhub += remotes; - if (remotes >= 16) + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + stat->s_ntarguvhub += hubs; + if (hubs >= 16) stat->s_ntarguvhub16++; - else if (remotes >= 8) + else if (hubs >= 8) stat->s_ntarguvhub8++; - else if (remotes >= 4) + else if (hubs >= 4) stat->s_ntarguvhub4++; - else if (remotes >= 2) + else if (hubs >= 2) stat->s_ntarguvhub2++; else stat->s_ntarguvhub1++; @@ -824,10 +783,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc->payload.sending_cpu = cpu; /* - * uv_flush_send_and_wait returns null if all cpu's were messaged, or - * the adjusted flush_mask if any cpu's were not messaged. + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. 
*/ - return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + return NULL; + else + return cpumask; } /* @@ -976,9 +938,11 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); seq_printf(file, "retries rok resetp resett giveup sto bz throt "); seq_printf(file, @@ -994,10 +958,14 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntarguvhub, stat->s_ntarguvhub16, + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_ntargcpu, stat->s_dtimeout); + stat->s_dtimeout); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, -- cgit v1.2.3 From f6d8a56693426b1f29ff5cafda8be0d65e4e1870 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Modularize BAU send and wait Streamline the large uv_flush_send_and_wait() function by use of a couple of helper functions. And remove some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/kernel/tlb_uv.c | 82 +++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 7f6ea611cb7..42d412fd8b0 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -75,7 +75,6 @@ #define DESC_STATUS_DESTINATION_TIMEOUT 2 #define DESC_STATUS_SOURCE_TIMEOUT 3 -#define TIMEOUT_DELAY 10 /* * delay for 'plugged' timeout retries, in microseconds */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index a1615058fad..abf3c31f14c 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -484,6 +484,47 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Our retries are blocked by all destination swack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. 
+ */ +static void +destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void +destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + hmaster->max_bau_concurrent = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + /* * Completions are taking a very long time due to a congested numalink * network. @@ -518,7 +559,7 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast is to be sent to, plus + * The flush_mask contains the cpus the broadcast is to be sent to including * cpus that are on the local uvhub. * * Returns 0 if all flushing represented in the mask was done. @@ -553,7 +594,6 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)); } - while (hmaster->uvhub_quiesce) cpu_relax(); @@ -584,40 +624,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, right_shift, this_cpu, bcp, smaster, try); if (completion_status == FLUSH_RETRY_PLUGGED) { - /* - * Our retries may be blocked by all destination swack - * resources being consumed, and a timeout pending. In - * that case hardware immediately returns the ERROR - * that looks like a destination timeout. 
- */ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } + destination_plugged(bau_desc, bcp, hmaster, stat); } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } + destination_timeout(bau_desc, bcp, hmaster, stat); } if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; @@ -628,10 +637,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } while ((completion_status == FLUSH_RETRY_PLUGGED) || (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - bcp->plugged_tries = 0; bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > bcp->complete_threshold) && (hmaster->max_bau_concurrent < @@ -740,7 +747,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); /* cpu statistics */ -- cgit v1.2.3 From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2010 21:35:17 +0200 Subject: x86: Unify dumpstack.h and stacktrace.h arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h declare interfaces that deal with the same topic. In fact, most of the files that include stacktrace.h also include dumpstack.h. Although dumpstack.h seems more reserved for the internals of stack traces, those internals are quite often needed to define specialized stack trace operations. And perf event arch headers are going to need access to such low level operations anyway. So don't continue to bother with dumpstack.h, as it is no longer about isolated deep internals. v2: fix struct stack_frame definition conflict in sysprof Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: Thomas Gleixner Cc: Soeren Sandmann --- arch/x86/include/asm/stacktrace.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.c | 2 -- arch/x86/kernel/dumpstack.c | 1 - arch/x86/kernel/dumpstack.h | 56 --------------------------------------- arch/x86/kernel/dumpstack_32.c | 2 -- arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/stacktrace.c | 7 ++--- 7 files changed, 56 insertions(+), 65 deletions(-) delete mode 100644 arch/x86/kernel/dumpstack.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 4dab78edbad..a957463d3c7 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -1,6 +1,13 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H +#include + extern int kstack_depth_to_print; struct thread_info; @@ -42,4 +49,49 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } +#endif + + return (unsigned long)frame; +} + #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c77586061bc..9632fb61e8f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b..6e8752c1bd5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd4..00000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs 
*regs, - unsigned long *stack, unsigned long bp, char *log_lvl); -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); -extern unsigned int code_bytes; -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d9..0f6376ffa2d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f..57a21f11c79 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6..ea54d029fe2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; -- cgit v1.2.3 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. This means frame pointers are no longer necessary to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implementation.
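A minimal sketch of why the helper must be a macro rather than an inline function (the fallback name below is invented for illustration, it is not the kernel's): generic code can only detect an arch-supplied override with the preprocessor, and an inline function is invisible to #ifndef, so there would be no way to know when to install the default.

	#ifndef perf_arch_fetch_caller_regs
	/* no arch override: fall back to a generic version (hypothetical name) */
	#define perf_arch_fetch_caller_regs(regs, ip) \
		perf_fetch_caller_regs_default(regs, ip)
	#endif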
Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- 3 files changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e..02de29830ff 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463d3c7..2b16a2ad23d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f..2c075fe573d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; -- cgit v1.2.3 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successful ->commit_txn also closes the transaction.
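The resulting calling convention can be sketched as follows (condensed caller-side code under assumed helper names such as event_sched_in(); this is an illustration, not the perf core's exact code):

	static int group_sched_in_sketch(struct pmu *pmu, struct perf_event *group)
	{
		struct perf_event *event;

		pmu->start_txn(pmu);			/* open the transaction */
		list_for_each_entry(event, &group->sibling_list, group_entry)
			event_sched_in(event);		/* schedulability test deferred */

		if (pmu->commit_txn(pmu)) {
			/* group did not fit: roll back and close the transaction */
			pmu->cancel_txn(pmu);
			return -EAGAIN;
		}
		/* success: commit_txn has already closed the transaction,
		 * so cancel_txn() must not be called on this path */
		return 0;
	}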
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a1..af04c6fa59c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } -- cgit v1.2.3 From 68aa00ac0a82e9a876c799bf6be7622b8f1c8517 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 3 Jun 2010 01:23:04 +0400 Subject: perf, x86: Make a second write to performance counter if needed On Netburst PMU we need a second write to a performance counter due to a cpu erratum. A simple flag test instead of alternative instructions was chosen because wrmsrl is already a macro and, if virtualization is turned on, it would need an additional wrapper call, which is more expensive. nb: we should probably switch to jump-labels once that facility reaches mainline.
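The jump-label note can be made concrete with a sketch; DEFINE_STATIC_KEY_FALSE and static_branch_unlikely are the names the facility eventually gained in mainline, long after this patch, so treat this purely as an illustration of the idea:

	#include <linux/jump_label.h>

	static DEFINE_STATIC_KEY_FALSE(perfctr_second_write);

	static void perfctr_write_sketch(unsigned int msr, u64 val)
	{
		wrmsrl(msr, val);
		/* on unaffected CPUs this branch is a patched-out NOP,
		 * avoiding the flag load and conditional jump entirely */
		if (static_branch_unlikely(&perfctr_second_write))
			wrmsrl(msr, val);	/* Netburst erratum: write twice */
	}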
Signed-off-by: Cyrill Gorcunov Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: Lin Ming Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100602212304.GC5264@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 +++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index af04c6fa59c..79e199843db 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -220,6 +220,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; void (*quirks)(void); + int perfctr_second_write; int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); @@ -925,8 +926,17 @@ x86_perf_event_set_period(struct perf_event *event) */ atomic64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d..9286e736a70 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -829,6 +829,15 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << 39) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, }; static __init int p4_pmu_init(void) -- cgit v1.2.3 From 1996bda2a42480c275656233e631ee0966574be4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:05:13 +0200 Subject: arch: Implement local64_t On 64bit, local_t is of size long, and thus we make local64_t an alias. On 32bit, we fall back to atomic64_t. (An architecture can provide an optimized 32-bit version.) (This new facility is to be used by perf events optimizations.) Signed-off-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/local64.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/include/asm/local64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/x86/include/asm/local64.h @@ -0,0 +1 @@ +#include -- cgit v1.2.3 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Now that all modifications to event->count (and ->prev_count and ->period_left) are local to a cpu, change them to local64_t so we avoid the LOCK'ed ops.
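For reference, a self-contained sketch of the usage pattern this conversion relies on (the struct and function names here are made up): a counter that is only ever written by its owning CPU can be a local64_t, which on x86-64 compiles to plain, non-LOCK'ed instructions while remaining safe to read from other CPUs.

	#include <asm/local64.h>

	struct sample_counter {
		local64_t count;
	};

	/* writer side: must run on the CPU that owns 'c' */
	static void sample_counter_add(struct sample_counter *c, long delta)
	{
		local64_add(delta, &c->count);	/* no lock prefix needed */
	}

	/* reader side: any CPU may observe the value */
	static long sample_counter_read(struct sample_counter *c)
	{
		return local64_read(&c->count);
	}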
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db..2d0d2906927 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); -- cgit v1.2.3 From 147ec4d2361e355ab32499f739cc24845ceb89da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:39 +0200 Subject: x86: Make save_stack_address() !CONFIG_FRAME_POINTER friendly If CONFIG_FRAME_POINTER=n, print_context_stack() shouldn't neglect the non-reliable addresses on the stack; they are all we have if dump_trace(bp) is called with a wrong or zero bp. For example, /proc/pid/stack doesn't work if CONFIG_FRAME_POINTER=n. This patch obviously has no effect if CONFIG_FRAME_POINTER=y, otherwise it reverts 1650743c "x86: don't save unreliable stack trace entries". Also, remove the unnecessary type-cast.
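To make the reliable/non-reliable distinction concrete, here is a simplified sketch of what the stack walker feeds these callbacks; on_frame_chain() is an invented helper standing in for the real frame-pointer following logic:

	/*
	 * With CONFIG_FRAME_POINTER=y only return addresses on the bp chain
	 * are reported with reliable != 0; raw text addresses found while
	 * scanning the stack come in with reliable == 0.  Without frame
	 * pointers there is no chain, so every address is "unreliable",
	 * and filtering those out, as the old code did, left the trace empty.
	 */
	static void walk_stack_sketch(unsigned long *stack, unsigned long *end,
				      const struct stacktrace_ops *ops, void *data)
	{
		while (stack < end) {
			unsigned long addr = *stack++;

			if (__kernel_text_address(addr))
				ops->address(data, addr, on_frame_chain(addr));
		}
	}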
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <20100603193239.GA31530@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index ea54d029fe2..abc321d5587 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -26,8 +26,10 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (trace->skip > 0) { trace->skip--; return; @@ -39,9 +41,11 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (in_sched_functions(addr)) return; if (trace->skip > 0) { -- cgit v1.2.3 From 018378c55b03f88ff513aba4e0e93b8d4a9cf241 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:43 +0200 Subject: x86: Unify save_stack_address() and save_stack_address_nosched() Cleanup. Factor the common code in save_stack_address() and save_stack_address_nosched(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar LKML-Reference: <20100603193243.GA31534@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index abc321d5587..b53c525368a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -23,13 +23,16 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) return; #endif + if (nosched && in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -38,22 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return; -#endif - if (in_sched_functions(addr)) - return; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; + return __save_stack_address(data, addr, reliable, true); } static const struct stacktrace_ops save_stack_ops = { -- cgit v1.2.3 From d6d4d4205cf4ce4ba13bc320305afbda25303496 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Jun 2010 12:07:46 +0200 Subject: x86, xsave: Cleanup return codes in check_for_xstate() The places which call check_for_xstate() only care about zero or non-zero so this patch doesn't change how 
the code runs, but it's a cleanup. The main reason for this patch is that I'm looking for places which don't return -EFAULT for copy_from_user() failures. Signed-off-by: Dan Carpenter LKML-Reference: <20100603100746.GU5483@bicker> Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/kernel/xsave.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24..980149867a1 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } -- cgit v1.2.3 From 8cc1176e5de534d55cb26ff0cef3fd0d6ad8c3c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 18:18:40 +0200 Subject: x86, cacheinfo: Carve out L3 cache slot accessors This is in preparation for disabling L3 cache indices after having received correctable ECCs in the L3 cache. Now we allow for initial setting of a disabled index slot (write once) and deny writing new indices to it after it has been disabled. Also, we deny using both slots to disable one and the same index. Userspace can restore the previously disabled indices by rewriting those sysfs entries when booting. Cleanup and reorganize code while at it. Signed-off-by: Borislav Petkov LKML-Reference: <20100602161840.GI18327@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 108 ++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae2062cf..898c2f4eab8 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) return l3; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) { int node; @@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3 = l3_caches[node]; } +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. 
+ */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int slot) { - struct pci_dev *dev = this_leaf->l3->dev; - unsigned int reg = 0; + int index; if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (!dev) - return -EINVAL; + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); - pci_read_config_dword(dev, 0x1BC + slot * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + return sprintf(buf, "FREE\n"); } #define SHOW_CACHE_DISABLE(slot) \ @@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, } } - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) { - struct pci_dev *dev = this_leaf->l3->dev; - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - unsigned long val = 0; + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - amd_l3_disable_index(this_leaf->l3, cpu, slot, val); - + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } @@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 
amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } -- cgit v1.2.3 From 12d8a961289644d265d8b3e88201878837c3b814 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 20:29:21 +0200 Subject: x86, AMD: Extend support to future families Extend support to future families, and in particular: * extend direct mapping split of Tseg SMM area. * extend K8 flavored alternatives (NOPS). * rep movs* prefix is fast in ucode. Signed-off-by: Borislav Petkov LKML-Reference: <20100602182921.GA21557@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d..12b9cff047c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* -- cgit v1.2.3 From 1f9a0bd4989fd16842ad71fc89240b48ab191446 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:08 +0800 Subject: x86, mce: Rename MSR_IA32_MCx_CTL2 value Rename CMCI_EN to MCI_CTL2_CMCI_EN and CMCI_THRESHOLD_MASK to MCI_CTL2_CMCI_THRESHOLD_MASK to make naming consistent. Signed-off-by: Huang Ying LKML-Reference: <1275977348.3444.659.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/include/asm/msr-index.h | 3 --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f32a4301c4d..82db1d8f064 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,10 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +/* CTL2 register defines */ +#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL + #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f..38f66eb5854 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -94,9 +94,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920..faf7b2919a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,19 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +155,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } -- cgit v1.2.3 From 3c417588603e5411f29d22a40f3b5ff71529a4f0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:10 +0800 Subject: x86, mce: Fix MSR_IA32_MCI_CTL2 CMCI threshold setup It is reported that CMCI is not raised when the number of corrected errors reaches the preset threshold. After inspection, it was found that the MSR_IA32_MCI_CTL2 threshold field was not set up properly. This patch fixes it. The value of MCI_CTL2_CMCI_THRESHOLD_MASK is also corrected according to the x86_64 Software Developer's Manual. Reported-by: Shaohui Zheng Signed-off-by: Huang Ying LKML-Reference: <1275977350.3444.660.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 82db1d8f064..c62c13cb978 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN (1ULL << 30) -#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index faf7b2919a8..6fcd0936194 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -102,6 +102,7 @@ static void cmci_discover(int banks, int boot) continue; } + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); -- cgit v1.2.3 From a2d7b0d4852536273b65d16fe179c65184fe5e2d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:35:39 +0800 Subject: x86, mce: Use HW_ERR in MCE handler Use the HW_ERR printk prefix in the MCE handler, to make it more explicit that this is a hardware error rather than a software error. Signed-off-by: Huang Ying LKML-Reference: <1275978939.3444.668.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..094b228c8b0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -211,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!"
: "", m->cs, m->ip); @@ -224,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -241,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; @@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) apei_err = apei_write_mce(final); } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -1220,7 +1208,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } -- cgit v1.2.3 From ec8c27e04f89a7575ca2c4facb99152e03d6a99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:45:36 -0700 Subject: mce: convert to rcu_dereference_index_check() The mce processing applies rcu_dereference_check() to integers used as array indices. This patch therefore moves mce to the new RCU API rcu_dereference_index_check() that avoids the sparse processing that would otherwise result in compiler errors. Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..0e78657e29c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) -- cgit v1.2.3 From 421f91d21ad6f799dc7b489bb33cc560ccc56f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 11 Jun 2010 12:17:00 +0200 Subject: fix typos concerning "initiali[zs]e" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/head32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e0161..192cd7ee35c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -459,7 +459,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e24603739..784360c0625 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; -- cgit v1.2.3 From 23016bf0d25d62c45d8b8f61d55b290d704f7a79 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 3 Jun 2010 23:22:28 -0400 Subject: x86: Look for IA32_ENERGY_PERF_BIAS support The new IA32_ENERGY_PERF_BIAS MSR allows system software to give hardware a hint whether OS policy favors more power saving, or more performance. This allows the OS to have some influence on internal hardware power/performance tradeoffs where the OS has previously had no influence. The support for this feature is indicated by CPUID.06H.ECX.bit3, as documented in the Intel Architectures Software Developer's Manual. This patch discovers support of this feature and displays it as "epb" in /proc/cpuinfo. Signed-off-by: Venkatesh Pallipadi LKML-Reference: Signed-off-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 3 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 46814591438..2a904f4071f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -162,6 +162,7 @@ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ /* Virtualization flags: Linux defined */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f..e57bc20683d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -238,6 +238,8 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 10fa5684a66..7369b4c2c55 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, -- cgit v1.2.3 From 8b8f79b927b6b302bb65fb8c56e7a19be5fbdbef Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 13 Jun 2010 23:56:54 +0200 Subject: x86, kmmio/mmiotrace: Fix double free of kmmio_fault_pages After every iounmap mmiotrace has to free kmmio_fault_pages, but it can't do it directly, so it defers freeing by RCU. It usually works, but when mmiotraced code calls ioremap-iounmap multiple times without sleeping between (so RCU won't kick in and start freeing) it can be given the same virtual address, so at every iounmap mmiotrace will schedule the same pages for release. Obviously it will explode on second free. Fix it by marking kmmio_fault_pages which are scheduled for release and not adding them second time. Signed-off-by: Marcin Slusarz Tested-by: Marcin Kocielnicki Tested-by: Shinpei KATO Acked-by: Pekka Paalanen Cc: Stuart Bennett Cc: Marcin Kocielnicki Cc: nouveau@lists.freedesktop.org Cc: LKML-Reference: <20100613215654.GA3829@joi.lan> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 16 +++++++++++++--- arch/x86/mm/testmmiotrace.c | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 5d0e67fff1a..e5d5e2ce9f7 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -45,6 +45,8 @@ struct kmmio_fault_page { * Protected by kmmio_lock, when linked into kmmio_page_table. 
*/ int count; + + bool scheduled_for_release; }; struct kmmio_delayed_release { @@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page, BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f); - f->release_next = *release_list; - *release_list = f; + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } } } @@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) prevp = &f->release_next; } else { *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; } - f = f->release_next; + f = *prevp; } spin_unlock_irqrestore(&kmmio_lock, flags); @@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p) kmmio_count--; spin_unlock_irqrestore(&kmmio_lock, flags); + if (!release_list) + return; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { pr_crit("leaking kmmio_fault_page objects.\n"); diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 8565d944f7c..38868adf07e 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -90,6 +90,27 @@ static void do_test(unsigned long size) iounmap(p); } +/* + * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in + * a short time. We had a bug in deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap_nocache(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); @@ -104,6 +125,7 @@ static int __init init(void) "and writing 16 kB of rubbish in there.\n", size >> 10, mmio_address); do_test(size); + do_test_bulk_ioremapping(); pr_info("All done.\n"); return 0; } -- cgit v1.2.3 From c882e0feb937af4e5b991cbd1c81536f37053e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Sch=C3=B6ne?= Date: Mon, 14 Jun 2010 13:37:20 +0200 Subject: x86, perf: Add power_end event to process_*.c cpu_idle routine Systems using the idle thread from process_32.c and process_64.c do not generate power_end events which could be traced using perf. This patch adds the event generation for such systems. 
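A condensed sketch of where the new tracepoint sits in the 32-bit idle loop after this patch (simplified from the hunks that follow; the 64-bit loop is analogous):

	void cpu_idle_sketch(void)
	{
		for (;;) {
			tick_nohz_stop_sched_tick(1);
			while (!need_resched()) {
				stop_critical_timings();
				pm_idle();	/* idle routine; may trace power_start */
				start_critical_timings();
				/* added: close the power event when idle ends */
				trace_power_end(smp_processor_id());
			}
			tick_nohz_restart_sched_tick();
			preempt_enable_no_resched();
			schedule();
			preempt_disable();
		}
	}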
Signed-off-by: Robert Schoene Acked-by: Arjan van de Ven Cc: Peter Zijlstra LKML-Reference: <1276515440.5441.45.camel@localhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++++ arch/x86/kernel/process_64.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af4..96586c3cbbb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,8 @@ #include #include +#include + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -111,6 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1..3d9ea531ddd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,8 @@ #include #include +#include + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -138,6 +140,9 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ -- cgit v1.2.3 From d7a0380dc3e6607d30ccdfc3cfc2ccee0d966716 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 16 Jun 2010 22:30:42 +0200 Subject: x86-64, mm: Initialize VDSO earlier on 64 bits When initrd is in use and a driver does request_module() in its module_init (i.e. __initcall or device_initcall), a modprobe process is created with VDSO mapping. But VDSO is inited even in __initcall, i.e. on the same level (at the same time), so it may not be inited yet (link order matters). Move the VDSO initialization code earlier by switching to something before rootfs_initcall where initrd is loaded as rootfs. Specifically to subsys_initcall. Do it for standard 64-bit path (init_vdso_vars) and for compat (sysenter_setup), just in case people have 32-bit initrd and ia32 emulation built-in. i386 (pure 32-bit) is not affected, since sysenter_setup() is called from check_bugs()->identify_boot_cpu() in start_kernel() before rest_init()->kernel_thread(kernel_init) where even kernel_init() calls do_basic_setup()->do_initcalls(). What this patch fixes are early modprobe crashes such as: Unpacking initramfs... Freeing initrd memory: 9324k freed modprobe[368]: segfault at 7fff4429c020 ip 00007fef397e160c \ sp 00007fff442795c0 error 4 in ld-2.11.2.so[7fef397df000+1f000] Signed-off-by: Jiri Slaby LKML-Reference: <1276720242-13365-1-git-send-email-jslaby@suse.cz> Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 02b442e9200..36df991985b 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_X86_64 -__initcall(sysenter_setup); +subsys_initcall(sysenter_setup); #ifdef CONFIG_SYSCTL /* Register vsyscall32 into the ABI table */ diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b814..43456ee1769 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -74,7 +74,7 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; return -ENOMEM; } -__initcall(init_vdso_vars); +subsys_initcall(init_vdso_vars); struct linux_binprm; -- cgit v1.2.3 From 05d0b0889ca9d033a960542af7f8a13b3ad4f630 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Jun 2010 14:36:26 -0700 Subject: x86, vdso: Error out if the vdso contains external references The vdso is a piece of userspace code which is supposed to be fully self-contained. Any external (undefined) reference is an error, and should be caught at compile time. This was giving us trouble when compiling with -Os on gcc 4.5.0, for example (failed inline). The need to do a buildtime check was pointed out by Andi Kleen. Reported-by: Andi Kleen LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 3 ++- arch/x86/vdso/checkundef.sh | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100755 arch/x86/vdso/checkundef.sh (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6b4ffedb93c..4a2afa1bac5 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh new file mode 100755 index 00000000000..490be1c38f9 --- /dev/null +++ b/arch/x86/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi -- cgit v1.2.3 From fd699c76552bbfa66631f019be415a87dbb08237 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Fri, 18 Jun 2010 17:46:53 -0400 Subject: x86, olpc: Add support for calling into OpenFirmware Add support for saving OFW's cif, and later calling into it to run OFW commands. OFW remains resident in memory, living within virtual range 0xff800000 - 0xffc00000. A single page directory entry points to the pgdir that OFW actually uses, so rather than saving the entire page table, we grab and install that one entry permanently in the kernel's page table. This is currently only used by the OLPC XO. Note that this particular calling convention breaks PAE and PAT, and so cannot be used on newer x86 hardware. Signed-off-by: Andres Salomon LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 9 ++++ arch/x86/include/asm/bootparam.h | 11 ++++- arch/x86/include/asm/olpc_ofw.h | 31 ++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head_32.S | 6 +++ arch/x86/kernel/olpc.c | 12 ++--- arch/x86/kernel/olpc_ofw.c | 104 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 6 +++ 8 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 arch/x86/include/asm/olpc_ofw.h create mode 100644 arch/x86/kernel/olpc_ofw.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..71c194db2e0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2062,6 +2062,15 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. +config OLPC_OPENFIRMWARE + bool "Support for OLPC's Open Firmware" + depends on !X86_64 && !X86_PAE + default y if OLPC + help + This option adds support for the implementation of Open Firmware + that is used on the OLPC XO-1 Children's Machine. + If unsure, say N here. + endif # X86_32 config K8_NB diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 6be33d83c71..8e6218550e7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -70,6 +70,14 @@ struct sys_desc_table { __u8 table[14]; }; +/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */ +struct olpc_ofw_header { + __u32 ofw_magic; /* OFW signature */ + __u32 ofw_version; + __u32 cif_handler; /* callback into OFW */ + __u32 irq_desc_table; +} __attribute__((packed)); + struct efi_info { __u32 efi_loader_signature; __u32 efi_systab; @@ -92,7 +100,8 @@ struct boot_params { __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ - __u8 _pad4[144]; /* 0x0b0 */ + struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ + __u8 _pad4[128]; /* 0x0c0 */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h new file mode 100644 index 00000000000..3e63d857c48 --- /dev/null +++ b/arch/x86/include/asm/olpc_ofw.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_OLPC_OFW_H +#define _ASM_X86_OLPC_OFW_H + +/* index into the page table containing the entry OFW occupies */ +#define OLPC_OFW_PDE_NR 1022 + +#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ + +#ifdef CONFIG_OLPC_OPENFIRMWARE + +/* run an OFW command by calling into the firmware */ +#define olpc_ofw(name, args, res) \ + __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) + +extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res); + +/* determine whether OFW is available and lives in the proper memory */ +extern void olpc_ofw_detect(void); + +/* install OFW's pde permanently into the kernel's pgtable */ +extern void setup_olpc_ofw_pgd(void); + +#else /* !CONFIG_OLPC_OPENFIRMWARE */ + +static inline void olpc_ofw_detect(void) { } +static inline void setup_olpc_ofw_pgd(void) { } + +#endif /* !CONFIG_OLPC_OPENFIRMWARE */ + +#endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b2208372..0925676266b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o diff 
--git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d8..ff4c453e13f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b..156605281f5 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include #include #include - -#ifdef CONFIG_OPEN_FIRMWARE -#include -#endif +#include struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -188,14 +185,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 00000000000..469ee438429 --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned 
long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd03..b008e788320 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include #include +#include #include #include @@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; -- cgit v1.2.3 From 75a9cac430a1bd2a5219c74508ca01b0ddfddc9a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 23 Jun 2010 20:27:00 -0400 Subject: x86, olpc: Add comment about implicit optimization barrier Signed-off-by: Andres Salomon Cc: H. Peter Anvin LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc_ofw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 469ee438429..f5d499fbe74 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -35,6 +35,8 @@ void __init setup_olpc_ofw_pgd(void) /* install OFW's PDE permanently into the kernel's pgtable */ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -- cgit v1.2.3 From 0c4519e825c9e2b6a8310deff8582f8c35bfbba9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 21:21:27 +0200 Subject: x86: Set resume bit before returning from breakpoint exception Instruction breakpoints trigger before the instruction executes, and returning from the breakpoint handler brings us back to the instruction that triggered the breakpoint. This naturally leads to breakpoint recursion. To solve this, x86 has the Resume Bit trick. When the cpu flags have the RF flag set, the next instruction won't trigger any instruction breakpoint, and once this instruction is executed, RF is cleared again. This lets us jump back to the instruction that triggered the breakpoint without recursion. Use this when an instruction breakpoint triggers. Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Prasad Cc: Mahesh Salgaonkar Cc: Paul Mackerras Cc: Ingo Molnar Cc: Jason Wessel --- arch/x86/kernel/hw_breakpoint.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2f..eaa6ae2a010 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -466,6 +466,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin.
+ */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + rcu_read_unlock(); } /* -- cgit v1.2.3 From f7809daf64bf119fef70af172db6a0636fa51f92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 10:00:24 +0200 Subject: x86: Support for instruction breakpoints Instruction breakpoints need to have a specific length of 0 to be working. Bring this support but also take care the user is not trying to set an unsupported length, like a range breakpoint for example. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- arch/x86/include/asm/hw_breakpoint.h | 2 +- arch/x86/kernel/hw_breakpoint.c | 44 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 942255310e6..528a11e8d3e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -20,10 +20,10 @@ struct arch_hw_breakpoint { #include /* Available HW breakpoint length encodings */ +#define X86_BREAKPOINT_LEN_X 0x00 #define X86_BREAKPOINT_LEN_1 0x40 #define X86_BREAKPOINT_LEN_2 0x44 #define X86_BREAKPOINT_LEN_4 0x4c -#define X86_BREAKPOINT_LEN_EXECUTE 0x40 #ifdef CONFIG_X86_64 #define X86_BREAKPOINT_LEN_8 0x48 diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index eaa6ae2a010..a474ec37c32 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, { /* Len */ switch (x86_len) { + case X86_BREAKPOINT_LEN_X: + *gen_len = sizeof(long); + break; case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) info->address = bp->attr.bp_addr; + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. 
+ */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + /* Len */ switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: @@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; } - /* Type */ - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; - break; - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; - break; - case HW_BREAKPOINT_X: - info->type = X86_BREAKPOINT_EXECUTE; - break; - default: - return -EINVAL; - } - return 0; } /* @@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { + case X86_BREAKPOINT_LEN_X: + align = sizeof(long) -1; + break; case X86_BREAKPOINT_LEN_1: align = 0; break; -- cgit v1.2.3 From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: increase max_active of keventd and kill current_is_keventd() Define WQ_MAX_ACTIVE and create keventd with max_active set to half of it, which means that keventd can now process up to WQ_MAX_ACTIVE / 2 - 1 works concurrently. Unless some combination can result in a dependency loop longer than max_active, deadlock won't happen, and thus it's unnecessary to check current_is_keventd() before trying to schedule a work. Kill current_is_keventd(). (Lockdep annotations are broken. We need lock_map_acquire_read_norecurse()) Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Tony Luck Cc: Andi Kleen Cc: Oleg Nesterov --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d..4d90f376e98 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); -- cgit v1.2.3 From 567a9fd86735ccdc897768ed2dacdd5e83a13509 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Jun 2010 14:53:50 +0900 Subject: kprobes/x86: Fix kprobes to skip prefixes correctly Fix resume_execution() and is_IF_modifier() to skip x86 instruction prefixes correctly by using the x86 instruction attributes. Without this fix, resume_execution() can't handle instructions which have non-REX prefixes (REX prefixes are skipped). This can cause an unexpected kernel panic by hitting a bad address when a kprobe hits a two-byte ret (e.g. "repz ret" generated for Athlon/K8 optimization), because it just checks for "repz" and can't recognize the "ret" instruction. These prefixes can be found easily with the x86 instruction attributes. This patch introduces skip_prefixes() and uses it in resume_execution() and is_IF_modifier() to skip prefixes.
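For illustration, a hedged sketch (not part of the patch) of what the new skip_prefixes() does for the two-byte "repz ret" (0xf3 0xc3) that gcc emits for Athlon/K8 tuning; the helper name and behavior are taken from the diff below:

	static int classifies_repz_ret(void)
	{
		kprobe_opcode_t buf[] = { 0xf3, 0xc3 };	/* repz ret */
		kprobe_opcode_t *insn = skip_prefixes(buf);

		/*
		 * 0xf3 is a legacy prefix in the instruction attribute
		 * tables, so insn now points at the real opcode 0xc3 (ret),
		 * which resume_execution() knows how to fix up.  The old
		 * is_REX_prefix() check only skipped 0x40-0x4f and would
		 * have left insn pointing at 0xf3.
		 */
		return *insn == 0xc3;
	}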
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli LKML-Reference: <4C298A6E.8070609@hitachi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 345a4b1fe14..175f85ceace 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } @@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { -- cgit v1.2.3 From ea812ca1b06113597adcd8e70c0f84a413d97544 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 29 Jun 2010 18:38:00 +0000 Subject: x86: Align skb w/ start of cacheline on newer core 2/Xeon Arch x86 architectures can handle unaligned accesses in hardware, and it has been shown that unaligned DMA accesses can be expensive on Nehalem architectures. As such we should overwrite NET_IP_ALIGN to resolve this issue. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b8fe48ee2ed..b4293fc8b79 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,4 +457,13 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } +#ifdef CONFIG_MCORE2 +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. 
+ */ +#define NET_IP_ALIGN 0 +#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit v1.2.3 From 7475271004b66e9c22e1bb28f240a38c5d6fe76e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 Jul 2010 13:28:27 +0000 Subject: x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is based on a suggestion from Andi Kleen. The assumption is that there are not any x86 cores where unaligned access is really slow, and this change would allow for a performance improvement to still exist on configurations that are not necessarily optimized for Core 2. Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b4293fc8b79..1db9bd2281d 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,7 +457,6 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } -#ifdef CONFIG_MCORE2 /* * We handle most unaligned accesses in hardware. On the other hand * unaligned DMA can be quite expensive on some Nehalem processors. @@ -465,5 +464,4 @@ static inline void rdtsc_barrier(void) * Based on this we disable the IP header alignment in network drivers. */ #define NET_IP_ALIGN 0 -#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit v1.2.3 From 39ef13a4ac28aa64cfe1bc36e6e00f1096707a28 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 5 Jul 2010 10:09:29 +0800 Subject: perf, x86: P4 PMU -- redesign cache events To support cache events we have reserved the low 6 bits in hw_perf_event::config (which is a part of the CCCR register configuration, actually). These bits represent the Replay Event metric enumerated in enum P4_PEBS_METRIC. The caller should not care about which exact bits should be set and how -- the caller just chooses one P4_PEBS_METRIC entity and puts it into the config. The kernel will track it and set the appropriate additional MSR registers (metrics) when needed. The reason for this redesign was the PEBS enable bit, which should not be set until DS (and PEBS sampling) support is implemented properly. TODO ==== - PEBS sampling (note it's tricky and works with _one_ counter only, so for HT machines it will not be that easy to handle both threads) - tracking of PEBS register state; a user might need to turn PEBS off completely (ie no PEBS enable, no UOP_tag) but some other event may need it, such events clash and should not run simultaneously; at the moment we just don't support such events - eventually export user space bits in a separate header, which will allow user apps to configure raw events more conveniently.
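To make the packing concrete, a hedged sketch of how a raw Replay Event carrying a PEBS metric would be assembled; it mirrors the P4_GEN_CACHE_EVENT macro and the dTLB load-miss table entry in the diff below, and is not an additional API:

	/*
	 * ESCR event selection goes into the high 32 bits of the config;
	 * CCCR bits -- including the P4_PEBS_METRIC index in the low
	 * 6 bits -- go into the low 32 bits.
	 */
	u64 config =
		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_REPLAY_EVENT) |
				    P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)) |
		p4_config_pack_cccr(P4_PEBS_METRIC__dtlb_load_miss_retired |
				    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(P4_EVENT_REPLAY_EVENT))));

On event creation, p4_validate_raw_event() checks the metric index against p4_pebs_bind_map[], and p4_pmu_enable_pebs() later writes the bound values into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT.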
Signed-off-by: Cyrill Gorcunov Signed-off-by: Lin Ming Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <1278295769.9540.15.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 99 ++++++++++++----------- arch/x86/kernel/cpu/perf_event_p4.c | 147 ++++++++++++++++++++++++++--------- 2 files changed, 163 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 64a8ebff06f..def500776b1 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -19,7 +19,6 @@ #define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) #define ARCH_P4_MAX_CCCR (18) -#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2) #define P4_ESCR_EVENT_MASK 0x7e000000U #define P4_ESCR_EVENT_SHIFT 25 @@ -71,10 +70,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Custom bits in reerved CCCR area */ -#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU - - /* Non HT mask */ #define P4_CCCR_MASK \ (P4_CCCR_OVF | \ @@ -106,8 +101,7 @@ * ESCR and CCCR but rather an only packed value should * be unpacked and written to a proper addresses * - * the base idea is to pack as much info as - * possible + * the base idea is to pack as much info as possible */ #define p4_config_pack_escr(v) (((u64)(v)) << 32) #define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) @@ -130,8 +124,6 @@ t; \ }) -#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK) - #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) @@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr) return escr; } +/* + * This are the events which should be used in "Event Select" + * field of ESCR register, they are like unique keys which allow + * the kernel to determinate which CCCR and COUNTER should be + * used to track an event + */ enum P4_EVENTS { P4_EVENT_TC_DELIVER_MODE, P4_EVENT_BPU_FETCH_REQUEST, @@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES { * a caller should use P4_ESCR_EMASK_NAME helper to * pick the EventMask needed, for example * - * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) + * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) */ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), @@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), }; -/* P4 PEBS: stale for a while */ -#define P4_PEBS_METRIC_MASK 0x00001fffU -#define P4_PEBS_UOB_TAG 0x01000000U -#define P4_PEBS_ENABLE 0x02000000U - -/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ -#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 -#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 -#define P4_PEBS__dtlb_load_miss_retired 0x3000004 -#define P4_PEBS__dtlb_store_miss_retired 0x3000004 -#define P4_PEBS__dtlb_all_miss_retired 0x3000004 -#define P4_PEBS__tagged_mispred_branch 0x3018000 -#define P4_PEBS__mob_load_replay_retired 0x3000200 -#define P4_PEBS__split_load_retired 0x3000400 -#define P4_PEBS__split_store_retired 0x3000400 - -#define P4_VERT__1stl_cache_load_miss_retired 0x0000001 -#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_store_miss_retired 0x0000002 -#define 
P4_VERT__dtlb_all_miss_retired 0x0000003 -#define P4_VERT__tagged_mispred_branch 0x0000010 -#define P4_VERT__mob_load_replay_retired 0x0000001 -#define P4_VERT__split_load_retired 0x0000001 -#define P4_VERT__split_store_retired 0x0000002 - -enum P4_CACHE_EVENTS { - P4_CACHE__NONE, - - P4_CACHE__1stl_cache_load_miss_retired, - P4_CACHE__2ndl_cache_load_miss_retired, - P4_CACHE__dtlb_load_miss_retired, - P4_CACHE__dtlb_store_miss_retired, - P4_CACHE__itlb_reference_hit, - P4_CACHE__itlb_reference_miss, - - P4_CACHE__MAX +/* + * P4 PEBS specifics (Replay Event only) + * + * Format (bits): + * 0-6: metric from P4_PEBS_METRIC enum + * 7 : reserved + * 8 : reserved + * 9-11 : reserved + * + * Note we have UOP and PEBS bits reserved for now + * just in case if we will need them once + */ +#define P4_PEBS_CONFIG_ENABLE (1 << 7) +#define P4_PEBS_CONFIG_UOP_TAG (1 << 8) +#define P4_PEBS_CONFIG_METRIC_MASK 0x3f +#define P4_PEBS_CONFIG_MASK 0xff + +/* + * mem: Only counters MSR_IQ_COUNTER4 (16) and + * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling + */ +#define P4_PEBS_ENABLE 0x02000000U +#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U + +#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK) +#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK) + +#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask)) + +enum P4_PEBS_METRIC { + P4_PEBS_METRIC__none, + + P4_PEBS_METRIC__1stl_cache_load_miss_retired, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired, + P4_PEBS_METRIC__dtlb_load_miss_retired, + P4_PEBS_METRIC__dtlb_store_miss_retired, + P4_PEBS_METRIC__dtlb_all_miss_retired, + P4_PEBS_METRIC__tagged_mispred_branch, + P4_PEBS_METRIC__mob_load_replay_retired, + P4_PEBS_METRIC__split_load_retired, + P4_PEBS_METRIC__split_store_retired, + + P4_PEBS_METRIC__max }; #endif /* PERF_EVENT_P4_H */ + diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 9286e736a70..107711bf0ee 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -21,22 +21,36 @@ struct p4_event_bind { char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; -struct p4_cache_event_bind { +struct p4_pebs_bind { unsigned int metric_pebs; unsigned int metric_vert; }; -#define P4_GEN_CACHE_EVENT_BIND(name) \ - [P4_CACHE__##name] = { \ - .metric_pebs = P4_PEBS__##name, \ - .metric_vert = P4_VERT__##name, \ +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ } -static struct p4_cache_event_bind p4_cache_event_bind_map[] = { - P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + 
P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), }; /* @@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { }, }; -#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ p4_config_pack_escr(P4_ESCR_EVENT(event) | \ P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(cache_event | \ + p4_config_pack_cccr(metric | \ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) static __initconst const u64 p4_hw_cache_event_ids @@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__1stl_cache_load_miss_retired), + P4_PEBS_METRIC__1stl_cache_load_miss_retired), }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__2ndl_cache_load_miss_retired), + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), }, }, [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_load_miss_retired), + P4_PEBS_METRIC__dtlb_load_miss_retired), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_store_miss_retired), + P4_PEBS_METRIC__dtlb_store_miss_retired), }, }, [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_CACHE__itlb_reference_hit), + P4_PEBS_METRIC__none), [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_CACHE__itlb_reference_miss), + P4_PEBS_METRIC__none), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v; + + /* user data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) { + pr_warning("P4 PMU: Unknown event code: %d\n", v); + return -EINVAL; + } + + /* + * it may have some screwed PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { + pr_warning("P4 PMU: PEBS are not supported yet\n"); + return -EINVAL; + } + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { + pr_warning("P4 PMU: Unknown metric code: %d\n", v); + return -EINVAL; + } + + return 0; +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); int rc = 0; - unsigned int evnt; u32 escr, cccr; /* @@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { - /* user data may have out-of-bound event index */ - evnt = p4_config_unpack_event(event->attr.config); - if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { - rc = -EINVAL; + rc = p4_validate_raw_event(event); + if (rc) goto out; - } /* * We don't control raw events so it's up to the caller @@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event) * on HT machine but allow HT-compatible specifics to be * passed on) * + * Note that for RAW events we allow user to use 
P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + * * XXX: HT wide things should check perf_paranoid_cpu() && * CAP_SYS_ADMIN */ event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT)); + p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); } rc = x86_setup_perfctr(event); @@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) return overflow; } +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * noone is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsence from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + static inline void p4_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void) continue; p4_pmu_disable_event(event); } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event) int thread = p4_ht_config_thread(hwc->config); u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); unsigned int idx = p4_config_unpack_event(hwc->config); - unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); struct p4_event_bind *bind; - struct p4_cache_event_bind *bind_cache; u64 escr_addr, cccr; bind = &p4_event_bind_map[idx]; @@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event) cccr = p4_config_unpack_cccr(hwc->config); /* - * it could be Cache event so that we need to - * set metrics into additional MSRs + * it could be Cache event so we need to write metrics + * into additional MSRs */ - BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); - if (idx_cache > P4_CACHE__NONE && - idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { - bind_cache = &p4_cache_event_bind_map[idx_cache]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); - } + p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); (void)checking_wrmsrl(hwc->config_base + hwc->idx, -- cgit v1.2.3 From 8e221b6db4477643fefc885a97ea9889ac733140 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 22 Jun 2010 16:23:37 -0700 Subject: x86: Avoid unnecessary __clear_user() and xrstor in signal handling fxsave/xsave doesn't touch all the bytes in the memory layout used by these instructions. 
Specifically SW reserved (bytes 464..511) fields in the fxsave frame and the reserved fields in the xsave header. To present a clean context for the signal handling, just clear these fields instead of clearing the complete fxsave/xsave memory layout, when we dump these registers directly to the user signal frame. Also avoid the call to second xrstor (which inits the state not passed in the signal frame) in restore_user_xstate() if all the state has already been restored by the first xrstor. These changes improve the performance of signal handling(by ~3-5% as measured by the lat_sig). Signed-off-by: Suresh Siddha LKML-Reference: <1277249017.2847.85.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 9 +++++++++ arch/x86/include/asm/xsave.h | 10 ++++++++++ arch/x86/kernel/xsave.c | 12 ++---------- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..0f1cf5d53dd 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -127,6 +127,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..30dfc81804d 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -59,6 +59,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu) static inline int xsave_user(struct xsave_struct __user *buf) { int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->xsave_hdr, + sizeof(struct xsave_hdr_struct)); + if (unlikely(err)) + return -EFAULT; + __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24..6e73db1b7b4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -91,14 +91,6 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - if (use_xsave()) err = xsave_user(buf); else @@ -184,8 +176,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; -- cgit v1.2.3 From 24da9c26f3050aee9314ec09930a24c80fe76352 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 7 Jul 2010 10:15:12 -0700 Subject: x86, cpu: Add CPU flags for F16C and RDRND Add support for the newly documented F16C (16-bit floating point conversions) and RDRND (RDRAND instruction) CPU feature flags. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 2a904f4071f..aeb6f3f9b2c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -124,6 +124,8 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -- cgit v1.2.3 From 83a7a2ad2a9173dcabc05df0f01d1d85b7ba1c2c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 10 Jun 2010 00:10:43 +0000 Subject: x86, alternatives: Use 16-bit numbers for cpufeature index We already have cpufeature indicies above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 7 ++++--- arch/x86/include/asm/cpufeature.h | 14 ++++++++------ arch/x86/kernel/entry_32.S | 2 +- arch/x86/lib/clear_page_64.S | 2 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/memcpy_64.S | 2 +- arch/x86/lib/memset_64.S | 2 +- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 03b6bb5394a..bc6abb7bc7e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,10 +45,9 @@ struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; - u8 cpuid; /* cpuid bit set for replacement */ + u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ - u8 pad1; #ifdef CONFIG_X86_64 u32 pad2; #endif @@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end) _ASM_ALIGN "\n" \ _ASM_PTR "661b\n" /* label */ \ _ASM_PTR "663f\n" /* new instruction */ \ - " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 46814591438..e8b88967de3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -291,7 +291,7 @@ extern const char * const x86_power_flags[32]; * patch the target code for additional performance. 
* */ -static __always_inline __pure bool __static_cpu_has(u8 bit) +static __always_inline __pure bool __static_cpu_has(u16 bit) { #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) asm goto("1: jmp %l[t_no]\n" @@ -300,11 +300,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "0\n" /* no replacement */ - " .byte %P0\n" /* feature bit */ + " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ - " .byte 0xff + 0 - (2b-1b)\n" /* padding */ ".previous\n" + /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); return true; t_no: @@ -318,10 +318,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "3f\n" - " .byte %P1\n" /* feature bit */ + " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" "3: movb $1,%0\n" @@ -337,7 +339,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ + __builtin_constant_p(bit) ? \ __static_cpu_has(bit) : \ boot_cpu_has(bit) \ ) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..7862cf510ea 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -914,7 +914,7 @@ ENTRY(simd_coprocessor_error) .balign 4 .long 661b .long 663f - .byte X86_FEATURE_XMM + .word X86_FEATURE_XMM .byte 662b-661b .byte 664f-663f .previous diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ebeafcce04a..aa4326bfb24 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -52,7 +52,7 @@ ENDPROC(clear_page) .align 8 .quad clear_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lclear_page_end - clear_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 727a5d46d2f..6fec2d1cebe 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -113,7 +113,7 @@ ENDPROC(copy_page) .align 8 .quad copy_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lcopy_page_end - copy_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index f82e884928a..bcbcd1e0f7d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -131,7 +131,7 @@ ENDPROC(__memcpy) .align 8 .quad memcpy .quad .Lmemcpy_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD /* * Replace only beginning, memcpy is used to apply alternatives, diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index e88d3b81644..09d34426965 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -121,7 +121,7 @@ ENDPROC(__memset) .align 8 .quad memset .quad .Lmemset_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lfinal - memset .byte .Lmemset_e - .Lmemset_c .previous -- cgit v1.2.3 From bdc802dcca1709b01988d57e91f9f35ce1609fcc Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 7 Jul 2010 17:29:18 -0700 Subject: x86, cpu: Support the features flags in new CPUID leaf 7 Intel has defined CPUID leaf 7 as the next set of feature flags (see the AVX specification, version 007). Add support for this new feature flags word. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/required-features.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeb6f3f9b2c..3ec9275cea4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -6,7 +6,7 @@ #include -#define NCAPINTS 9 /* N 32-bit words worth of info */ +#define NCAPINTS 10 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -159,14 +159,14 @@ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc + * CPUID levels like 0x6, 0xA etc, word 7 */ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -/* Virtualization flags: Linux defined */ +/* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ @@ -177,6 +177,9 @@ #define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ + #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include @@ -197,7 +200,9 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ ? 
1 : \ test_cpu_cap(c, bit)) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 64cf2d24fad..6c7fc25f2c3 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -84,5 +84,7 @@ #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 #define REQUIRED_MASK7 0 +#define REQUIRED_MASK8 0 +#define REQUIRED_MASK9 0 #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211..c7358303d8c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; -- cgit v1.2.3 From 5bbd4a336c81d32df71642abf310cf3d0c98dc9b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 7 Jul 2010 19:51:59 -0400 Subject: x86/apic/es7000_32: Remove unused variable In today's linux-next I got this compile warning: arch/x86/kernel/apic/es7000_32.c:132: warning: 'base' defined but not used Current patch solves the issue removing the unused variable. Signed-off-by: Javier Martinez Canillas Cc: Rakib Mullick Cc: Eric W. Biederman Cc: Cyrill Gorcunov Cc: Tejun Heo LKML-Reference: <1278546719.9020.4.camel@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a87fe..8593582d802 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,7 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { -- cgit v1.2.3 From 9279aa55061a280b826bdf9ba5ab5f6a566c1dfb Mon Sep 17 00:00:00 2001 From: Ky Srinivasan Date: Mon, 28 Jun 2010 08:48:55 -0600 Subject: x86: Export the symbol ms_hyperv This is needed so that the staging hyperv can properly access this symbol. Signed-off-by: K. Y. Srinivasan Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mshyperv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b..d944bf6c50e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); static bool __init ms_hyperv_platform(void) { -- cgit v1.2.3 From ffa71f33a820d1ab3f2fc5723819ac60fb76080b Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:22:40 +0900 Subject: x86, ioremap: Fix incorrect physical address handling in PAE mode Current x86 ioremap() doesn't handle physical address higher than 32-bit properly in X86_32 PAE mode. When physical address higher than 32-bit is passed to ioremap(), higher 32-bits in physical address is cleared wrongly. Due to this bug, ioremap() can map wrong address to linear address space. In my case, 64-bit MMIO region was assigned to a PCI device (ioat device) on my system. 
Because of this ioremap() bug, the wrong physical address (instead of the MMIO region) was mapped into the linear address space, so loading the ioatdma driver caused unexpected behavior (kernel panic, kernel hang, ...). Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE680.7090408@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12e4d2d3c11..754cb4cbce6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, vaddr; - resource_size_t last_addr; + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; @@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; - (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); - pfn++) { - + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) @@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, -- cgit v1.2.3 From 35be1b716a475717611b2dc04185e9d80b9cb693 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:23:57 +0900 Subject: x86, ioremap: Fix normal ram range check The check for normal RAM in the x86 ioremap() code does not work for the last page frame in the specified physical address range. Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE6CD.1080704@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 754cb4cbce6..d41d3a9036c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -101,7 +101,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { + for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -- cgit v1.2.3 From fa97bdf92709adaaf8b9a5164a895e262a4fcf60 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 11 Jul 2010 11:06:57 +0300 Subject: x86, setup: Early-boot serial I/O support This patch adds serial I/O support to the real-mode setup (very early boot) printf(). It's useful for debugging boot code when running Linux under KVM, for example. The actual code was lifted from early printk. Cc: Cyrill Gorcunov Cc: Ingo Molnar Cc: Yinghai Lu Signed-off-by: Pekka Enberg LKML-Reference: <1278835617-11368-1-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: H.
Peter Anvin --- arch/x86/boot/boot.h | 16 +++++++ arch/x86/boot/main.c | 3 ++ arch/x86/boot/string.c | 41 ++++++++++++++++++ arch/x86/boot/tty.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 165 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 98239d2658f..46c4c5c71af 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -37,6 +37,8 @@ extern struct setup_header hdr; extern struct boot_params boot_params; +#define cpu_relax() asm volatile("rep; nop") + /* Basic port I/O */ static inline void outb(u8 v, u16 port) { @@ -203,6 +205,17 @@ static inline int isdigit(int ch) return (ch >= '0') && (ch <= '9'); } +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + /* Heap -- available for dynamic lists. */ extern char _end[]; extern char *HEAP; @@ -329,10 +342,13 @@ void initregs(struct biosregs *regs); /* string.c */ int strcmp(const char *str1, const char *str2); +int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ +void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 140172b895b..4ef1a33e857 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -130,6 +130,9 @@ void main(void) /* First, copy the boot header into the "zeropage" */ copy_boot_params(); + /* Initialize the early-boot console */ + console_init(); + /* End of heap check */ init_heap(); diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index f94b7a0c2ab..aba29df4a7b 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2) return 0; } +int strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + size_t strnlen(const char *s, size_t maxlen) { const char *es = s; @@ -48,3 +64,28 @@ unsigned int atou(const char *s) i = i * 10 + (*s++ - '0'); return i; } + +/* Works only for digits and letters, but small and fast */ +#define TOLOWER(x) ((x) | 0x20) + +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) +{ + unsigned long long result = 0; + + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') + cp += 2; + + while (isxdigit(*cp)) { + unsigned int value; + + value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10; + if (value >= base) + break; + result = result * base + value; + cp++; + } + if (endp) + *endp = (char *)cp; + + return result; +} diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 01ec69c901c..f3ceee20ff1 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -10,23 +10,51 @@ * ----------------------------------------------------------------------- */ /* - * Very simple screen I/O - * XXX: Probably should add very simple serial I/O? 
+ * Very simple screen and serial I/O */ #include "boot.h" +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +static int early_serial_base; + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + /* * These functions are in .inittext so they can be used to signal * error during initialization. */ -void __attribute__((section(".inittext"))) putchar(int ch) +static void __attribute__((section(".inittext"))) serial_putchar(int ch) { - struct biosregs ireg; + unsigned timeout = 0xffff; - if (ch == '\n') - putchar('\r'); /* \n -> \r\n */ + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +static void __attribute__((section(".inittext"))) bios_putchar(int ch) +{ + struct biosregs ireg; initregs(&ireg); ireg.bx = 0x0007; @@ -36,6 +64,17 @@ void __attribute__((section(".inittext"))) putchar(int ch) intcall(0x10, &ireg, NULL); } +void __attribute__((section(".inittext"))) putchar(int ch) +{ + if (ch == '\n') + putchar('\r'); /* \n -> \r\n */ + + bios_putchar(ch); + + if (early_serial_base != 0) + serial_putchar(ch); +} + void __attribute__((section(".inittext"))) puts(const char *str) { while (*str) @@ -112,3 +151,63 @@ int getchar_timeout(void) return 0; /* Timeout! */ } + +static void early_serial_init(int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +void console_init(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + early_serial_base = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int port = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + port = 1; + + early_serial_base = bases[port]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (early_serial_base != 0) + early_serial_init(baud); +} -- cgit v1.2.3 From ce0aa5dd20e44372f9617dd67c984f41fcdbed88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 13 Jul 2010 13:35:17 -0700 Subject: x86, setup: Make the setup code also accept console=uart8250 Make the boot code also accept the console=uart8250,io,0x2f8,115200n form of early console. Also add back simple_guess_base(), otherwise those simple_strtoull(,,0) are not going to work. Signed-off-by: Yinghai Lu LKML-Reference: <4C3CCE05.4090505@kernel.org> Acked-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/string.c | 22 +++++++++++++++++++ arch/x86/boot/tty.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index aba29df4a7b..3cbc4058dd2 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -68,10 +68,32 @@ unsigned int atou(const char *s) /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) +static unsigned int simple_guess_base(const char *cp) +{ + if (cp[0] == '0') { + if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2])) + return 16; + else + return 8; + } else { + return 10; + } +} + +/** + * simple_strtoull - convert a string to an unsigned long long + * @cp: The start of the string + * @endp: A pointer to the end of the parsed string will be placed here + * @base: The number base to use + */ + unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { unsigned long long result = 0; + if (!base) + base = simple_guess_base(cp); + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') cp += 2; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f3ceee20ff1..f6d52e65f97 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -170,7 +170,7 @@ static void early_serial_init(int baud) outb(c & ~DLAB, early_serial_base + LCR); } -void console_init(void) +static int parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; @@ -208,6 +208,63 @@ void console_init(void) baud = DEFAULT_BAUD; } + return baud; +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static int parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return baud; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + early_serial_base = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + early_serial_base = simple_strtoull(options + 8, &options, 0); + else + return baud; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(early_serial_base); + + return baud; +} + +void console_init(void) +{ + int baud; + + baud = parse_earlyprintk(); + + if (!early_serial_base) + baud = parse_console_uart8250(); + if (early_serial_base != 0) early_serial_init(baud); } -- cgit v1.2.3 From df378ccfc4dd04e263426ad805516915874774aa Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:55:11 -0700 Subject: x86, alternatives: Fix one more open-coded 8-bit alternative number Fix a missing case of an 8-bit alternative number, buried inside an assembly macro. Signed-off-by: H. 
Peter Anvin Reported-by: Yinghai Lu Cc: Suresh Siddha LKML-Reference: <4C3BDDA3.2060900@kernel.org> --- arch/x86/lib/copy_user_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 71100c98e33..a460158b5ac 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -29,7 +29,7 @@ .align 8 .quad 0b .quad 2b - .byte \feature /* when feature is set */ + .word \feature /* when feature is set */ .byte 5 .byte 5 .previous -- cgit v1.2.3 From 3b770a2128423a687e6e9c57184a584fb4ba4c77 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:57:50 -0700 Subject: x86, alternatives: BUG on encountering an invalid CPU feature number Make the alternatives-patching code BUG on encountering an invalid CPU feature number. Should have done this a long time ago. Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Suresh Siddha LKML-Reference: --- arch/x86/kernel/alternative.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 70237732a6c..f65ab8b014c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 70b0d22d581a5deef7b2876b0c3774635b8d846c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 14 Jul 2010 11:26:57 -0700 Subject: x86, setup: Only set early_serial_base after port is initialized putchar() uses early_serial_base to check whether the port is initialized, so we only assign it after early_serial_init() is called, in case we need to use VGA to debug the early serial console. Also display the port address and baud rate. -v2: update to current tip Acked-by: Pekka Enberg Signed-off-by: Yinghai Lu LKML-Reference: <4C3E0171.6050008@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/tty.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f6d52e65f97..ff4b27a0fc5 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -152,35 +152,40 @@ int getchar_timeout(void) return 0; /* Timeout!
*/ } -static void early_serial_init(int baud) +static void early_serial_init(int port, int baud) { unsigned char c; unsigned divisor; - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = port; + + printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); } -static int parse_earlyprintk(void) +static void parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; int pos = 0; + int port = 0; if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { char *e; if (!strncmp(arg, "serial", 6)) { - early_serial_base = DEFAULT_SERIAL_PORT; + port = DEFAULT_SERIAL_PORT; pos += 6; } @@ -189,15 +194,15 @@ static int parse_earlyprintk(void) if (!strncmp(arg, "ttyS", 4)) { static const int bases[] = { 0x3f8, 0x2f8 }; - int port = 0; + int idx = 0; if (!strncmp(arg + pos, "ttyS", 4)) pos += 4; if (arg[pos++] == '1') - port = 1; + idx = 1; - early_serial_base = bases[port]; + port = bases[idx]; } if (arg[pos] == ',') @@ -208,7 +213,8 @@ static int parse_earlyprintk(void) baud = DEFAULT_BAUD; } - return baud; + if (port) + early_serial_init(port, baud); } #define BASE_BAUD (1843200/16) @@ -227,44 +233,41 @@ static unsigned int probe_baud(int port) return BASE_BAUD / quot; } -static int parse_console_uart8250(void) +static void parse_console_uart8250(void) { char optstr[64], *options; int baud = DEFAULT_BAUD; + int port = 0; /* * console=uart8250,io,0x3f8,115200n8 * need to make sure it is last one console ! */ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return baud; + return; options = optstr; if (!strncmp(options, "uart8250,io,", 12)) - early_serial_base = simple_strtoull(options + 12, &options, 0); + port = simple_strtoull(options + 12, &options, 0); else if (!strncmp(options, "uart,io,", 8)) - early_serial_base = simple_strtoull(options + 8, &options, 0); + port = simple_strtoull(options + 8, &options, 0); else - return baud; + return; if (options && (options[0] == ',')) baud = simple_strtoull(options + 1, &options, 0); else - baud = probe_baud(early_serial_base); + baud = probe_baud(port); - return baud; + if (port) + early_serial_init(port, baud); } void console_init(void) { - int baud; - - baud = parse_earlyprintk(); + parse_earlyprintk(); if (!early_serial_base) - baud = parse_console_uart8250(); - - if (early_serial_base != 0) - early_serial_init(baud); + parse_console_uart8250(); } -- cgit v1.2.3 From b2691085d1f3ccce641dcfdd02722ba5d34db6ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Jun 2010 16:46:48 -0700 Subject: x86: Clean up arch/x86/kernel/cpu/mtrr/cleanup.c: use ";" not "," to terminate statements Also needed if pr_ becomes a bit more space efficient. Signed-off-by: Joe Perches Acked-by: Thomas Gleixner Cc: "H. 
Peter Anvin" LKML-Reference: <1277768808.29157.280.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f01..c5f59d07142 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", -- cgit v1.2.3 From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 May 2010 20:17:00 +0200 Subject: unistd: add __NR_prlimit64 syscall numbers Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to asm-x86, both 32 and 64-bit. Signed-off-by: Jiri Slaby Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa..a88e31d1836 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_prlimit64 ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a..35e0cb151b6 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_prlimit64 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81..570bf5eae56 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_prlimit64 300 +__SYSCALL(__NR_prlimit64, sys_prlimit64) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b372934121..eca1d7d23ab 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_prlimit64 -- cgit v1.2.3 From 93a7ca0c3ebe5d931126f1fb732cb9c4518383d4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 16 Jul 2010 10:11:21 -0500 Subject: x86, UV: 
Initialize BAU MMRs only on hubs with cpus Remove the initialization of MMRs UVH_LB_BAU_SB_ACTIVATION_CONTROL and UVH_BAU_DATA_BROADCAST on UV hubs that have no active cpus. Such initialization on hubs with no active cpus would result in a kernel page fault. This is not of real high priority, because we don't have any such systems (with UV hubs that have no active cpus). But they will be coming. Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index abf3c31f14c..59efb5390b3 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1635,12 +1635,16 @@ static int __init uv_bau_init(void) alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); + if (uv_blade_nr_possible_cpus(uvhub)) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, + UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, + mmr); + } } return 0; -- cgit v1.2.3 From a2531293dbb7608fa672ff28efe3ab4027917a2f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 18 Jul 2010 14:27:13 +0200 Subject: update email address pavel@suse.cz no longer works, replace it with working address. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/power/cpu.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b9..f51cc55aced 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek + * Copyright (C) 2001-2003 Pavel Machek */ #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182ca3a..4c9c67bf09b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek + * Make kapmd absorb more idle time (Pavel Machek * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * fixed by sfr). diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e..0af9aa20fce 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -9,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek + * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. 
* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d..9a6674689a2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -2,7 +2,7 @@ * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Pavel Machek * Copyright (C) 2002,2003 Andi Kleen */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1290ba54b35..e7e8c5f5495 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index d24f983ba1e..460f314d13e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ -- cgit v1.2.3 From 6c54aabd5e687092557f4881ce2d4013b971f293 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 12:03:51 -0400 Subject: x86/amd-iommu: Use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c..29dd3b9f2f0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2609,8 +2609,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; -- cgit v1.2.3 From edb18f8ab02843453306601c4aa697f9691129cd Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:50 -0700 Subject: x86, cpu: Make init_scattered_cpuid_features() consider cpuid subleaves Some cpuid features (like xsaveopt) are enumerated using cpuid subleaves. Extend init_scattered_cpuid_features() to take subleaf into account. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.439900717@sbs-t61.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 7369b4c2c55..03cf24a3d93 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -14,6 +14,7 @@ struct cpuid_bit { u8 reg; u8 bit; u32 level; + u32 sub_leaf; }; enum cpuid_regs { @@ -30,16 +31,16 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } }; for (cb = cpuid_bits; cb->feature; cb++) { @@ -50,8 +51,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX], - &regs[CR_ECX], &regs[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], + &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); -- cgit v1.2.3 From 40e1d7a4ffee5cb17f5c36f4c3c4a011ab103ebe Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:51 -0700 Subject: x86, cpu: Add xsaveopt cpufeature Add cpu feature bit support for the XSAVEOPT instruction. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.523204988@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3ec9275cea4..d5ea3e3a8a4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,6 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ -- cgit v1.2.3 From 5734f62b6601d88fd8ec720cb56b93fd3a030557 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, cpu: Enumerate xsaveopt Enumerate the xsaveopt feature. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 03cf24a3d93..41eebcd90fc 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -35,6 +35,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, -- cgit v1.2.3 From a1488f8bf4d72ad724700f6e982469a1240e4264 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:48 -0700 Subject: x86, xsave: Track the offset, size of state in the xsave layout Subleaves of cpuid leaf 0xd provide the offset and size of the different feature states that are managed by xsave/xrstor. Track this for upcoming use during signal handling. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.262987929@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 980149867a1..4993caa4181 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -21,6 +21,8 @@ struct _fpx_sw_bytes fx_sw_reserved; struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -301,6 +303,31 @@ void __cpuinit xsave_init(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + /* * setup the xstate image representing the init state */ @@ -308,6 +335,8 @@ static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + setup_xstate_features(); } /* -- cgit v1.2.3 From 29104e101d710dd152f807978884643a52eca8b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:49 -0700 Subject: x86, xsave: Sync xsave memory layout with its header for user handling With xsaveopt, if a processor implementation discerns that a processor state component is in its initialized state, it may clear the corresponding bit in xsave_hdr.xstate_bv to '0' without modifying the corresponding memory layout. Hence, while presenting the xstate information to the user, we always ensure that the memory layout of a feature will be in the init state if the corresponding header bit is zero (a simplified sketch of this rule follows).
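[Editor's note: a simplified, compilable sketch of that rule, with hypothetical helpers standing in for the xstate_offsets[]/xstate_sizes[] table and init image recorded by the previous patch; the authoritative version is the __sanitize_i387_state() hunk in the xsave.c diff below.]

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-ins for the recorded per-feature layout data. */
    extern unsigned int xstate_offset(int feature);
    extern unsigned int xstate_size(int feature);
    extern const unsigned char *init_image;

    /* For every supported feature whose header bit is clear, the saved
     * area may be stale, so overwrite it with the feature's init state
     * before showing the buffer to user space. */
    void sanitize_xstate(unsigned char *buf, uint64_t xstate_bv,
                         uint64_t pcntxt_mask)
    {
            int i;

            for (i = 2; i < 64; i++) { /* bits 0/1: FP/SSE legacy areas */
                    uint64_t bit = 1ULL << i;

                    if ((pcntxt_mask & bit) && !(xstate_bv & bit))
                            memcpy(buf + xstate_offset(i),
                                   init_image + xstate_offset(i),
                                   xstate_size(i));
            }
    }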
This ensures consistency and avoids the user seeing stale state in the memory layout during signal handling, debugging, etc. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.351459480@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 14 +++++++ arch/x86/include/asm/xsave.h | 10 +++++ arch/x86/kernel/i387.c | 11 ++++++ arch/x86/kernel/xsave.c | 89 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 123 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..bb370fd0a1c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -58,11 +58,25 @@ extern int restore_i387_xstate_ia32(void __user *buf); #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_xsaveopt(void) +{ + return 0; +} + static __always_inline __pure bool use_xsave(void) { return static_cpu_has(X86_FEATURE_XSAVE); } +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + #ifdef CONFIG_X86_64 /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..0c72adc0cb1 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -111,6 +111,16 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask) : "memory"); } +static inline void xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b3225..6106af9fd12 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -190,6 +190,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); } @@ -207,6 +209,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); @@ -446,6 +450,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -467,6 +473,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -533,6 +541,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4993caa4181..368047c8d50 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -23,6 +23,76 @@ struct _fpx_sw_bytes fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -112,6 +182,7 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { + sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; @@ -333,10 +404,26 @@ static void setup_xstate_features(void) */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; - setup_xstate_features(); + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* -- cgit v1.2.3 From 6bad06b768920e278c7cedfdda56a0b4c6a35ee9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, xsave: Use xsaveopt in context-switch path when supported xsaveopt is a more optimized form of xsave specifically designed for the context switch usage. xsaveopt doesn't save the state that's not modified from the prior xrstor. 
And if a specific feature state gets modified to the init state, then xsaveopt just updates the header bit in the xsave memory layout without updating the corresponding memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 2 +- arch/x86/include/asm/xsave.h | 9 ++++++--- arch/x86/kernel/cpu/common.c | 8 ++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index bb370fd0a1c..59bd93ac7fe 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -60,7 +60,7 @@ extern int restore_i387_xstate_ia32(void __user *buf); static __always_inline __pure bool use_xsaveopt(void) { - return 0; + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 0c72adc0cb1..ec86c5fd6a6 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -125,8 +125,11 @@ static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ - __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" - : : "D" (&(fpu->state->xsave)), - "a" (-1), "d"(-1) : "memory"); + alternative_input( + ".byte " REX_PREFIX "0x0f,0xae,0x27", + ".byte " REX_PREFIX "0x0f,0xae,0x37", + X86_FEATURE_XSAVEOPT, + [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : + "memory"); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7358303d8c..3f715efc594 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -- cgit v1.2.3 From 278bc5f6abd69dd868746dbd642266ac09a9c9c6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:53:51 -0700 Subject: x86, cpu: Clean up formatting in cpufeature.h, remove override Clean up the formatting in cpufeature.h, and remove an unnecessary name override. Signed-off-by: H. 
Peter Anvin Cc: Suresh Siddha LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d5ea3e3a8a4..4be50ddd4d7 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,7 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ +#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ @@ -173,13 +173,13 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From 2decb194e65ab66eaf787512dc572cdc99893b24 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:32:04 -0700 Subject: x86, cpu: Split addon_cpuid_features.c addon_cpuid_features.c contains exactly two almost completely unrelated functions, plus has a long and very generic name. Split it into two files, scattered.c for the scattered feature flags, and topology.c for the topology information. Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 150 ----------------------------- arch/x86/kernel/cpu/scattered.c | 61 ++++++++++++ arch/x86/kernel/cpu/topology.c | 99 +++++++++++++++++++ 4 files changed, 161 insertions(+), 151 deletions(-) delete mode 100644 arch/x86/kernel/cpu/addon_cpuid_features.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/topology.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6..5e3a3512ba0 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,7 +12,7 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c deleted file mode 100644 index 41eebcd90fc..00000000000 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. - */ -#include - -#include -#include - -#include - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; - u32 sub_leaf; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { 0, 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], - &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 - -/* leaf 0xb sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 - -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) - -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; - - if (c->cpuid_level < 0xb) - return; - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - - /* - * check if the cpuid leaf 0xb is actually implemented.
- */ - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; - - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); - - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->initial_apicid = edx; - - /* - * Populate HT related information from sub-leaf level 0. - */ - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); - - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } - return; -#endif -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 00000000000..9815364b477 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,61 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ +#include + +#include +#include + +#include + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], + &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 00000000000..4397e987a1c --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,99 @@ +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection.
+ */ + +#include +#include +#include +#include + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + static bool printed; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } + return; +#endif +} -- cgit v1.2.3 From 093d7b4639951ea3021a6f70d376c3ff31f4740c Mon Sep 17 00:00:00 2001 From: Miroslav Rezanina Date: Wed, 16 Sep 2009 03:56:17 -0400 Subject: xen: release unused free memory Scan an e820 table and release any memory which lies between e820 entries, as it won't be used and would just be wasted. At present this is just to release any memory beyond the end of the e820 map, but it will also deal with holes being punched in the map. 
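[Editor's note: a worked illustration of the gap walk described above, using made-up e820 entries; the real loop is xen_return_unused_memory() in the diff below. The walker remembers where the previous entry ended and releases everything from there to the start of the next entry.]

    #include <stdio.h>
    #include <stdint.h>

    struct e820_entry { uint64_t addr, size; };

    /* Hypothetical two-entry map: low memory, then RAM above 1 MB. */
    static const struct e820_entry map[] = {
            { 0x00000000ULL, 0x0009f000ULL },   /* 0 - 636 KB */
            { 0x00100000ULL, 0x07f00000ULL },   /* 1 MB - ~128 MB */
    };

    int main(void)
    {
            uint64_t last_end = 0;
            unsigned int i;

            for (i = 0; i < 2; i++) {
                    if (map[i].addr > last_end)  /* a hole: give it back */
                            printf("release [%#llx, %#llx)\n",
                                   (unsigned long long)last_end,
                                   (unsigned long long)map[i].addr);
                    last_end = map[i].addr + map[i].size;
            }
            /* prints: release [0x9f000, 0x100000) -- the VGA/BIOS hole */
            return 0;
    }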
Derived from patch by Miroslav Rezanina Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 - arch/x86/xen/setup.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'arch/x86')
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..399bed2de88 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -731,7 +731,6 @@ static void set_xen_basic_apic_ops(void) #endif - static void xen_clts(void) { struct multicall_space mcs;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd..e0942630d47 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "xen-ops.h" @@ -32,6 +33,56 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long start, end; + unsigned long len; + unsigned long pfn; + int ret; + + start = PFN_UP(start_addr); + end = PFN_UP(end_addr); + + if (end <= start) + return 0; + + len = end - start; + + set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); + reservation.nr_extents = len; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + + for(pfn = start; pfn < end; pfn++) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + + return len; +} + +static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +{ + unsigned long last_end = 0; + unsigned long released = 0; + int i; + + for (i = 0; i < e820->nr_map; i++) { + released += xen_release_chunk(last_end, e820->map[i].addr); + last_end = e820->map[i].addr + e820->map[i].size; + } + + released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + + printk(KERN_INFO "released %ld pages of unused memory\n", released); + return released; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. @@ -67,6 +118,8 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + xen_return_unused_memory(&e820); + return "Xen"; } -- cgit v1.2.3
From f89e048e76da7ac0b4c89e75606ca7a3422886b1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 16 Sep 2009 12:38:33 -0700 Subject: xen: make sure pages are really part of domain before freeing Scan the set of pages we're freeing and make sure they're actually owned by the domain before freeing. This generally won't happen on a domU (since Xen gives us contiguous memory), but it could happen if there are some hardware mappings passed through. We only bother going up to the highest page Xen actually claimed to give us, since there's definitely nothing of ours above that.
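[Editor's note: the ownership test this patch adds reduces to a round trip through the phys-to-machine map -- a pfn is only ours if pfn -> mfn -> pfn comes back unchanged. Below is a toy, self-contained illustration of that invariant; the table and helpers are stand-ins, not the kernel's p2m/m2p code.]

#include <stdio.h>

#define NPAGES			4
#define INVALID_P2M_ENTRY	(~0UL)

/* Toy p2m table: pfn 2 has no machine page behind it. */
static unsigned long p2m[NPAGES] = { 7, 3, INVALID_P2M_ENTRY, 9 };

static unsigned long mfn_to_pfn(unsigned long mfn)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NPAGES; pfn++)
		if (p2m[pfn] == mfn)
			return pfn;
	return INVALID_P2M_ENTRY;
}

/* A pfn belongs to this domain only if both views agree; a
 * passed-through hardware mapping fails the round trip. */
static int page_is_ours(unsigned long pfn)
{
	unsigned long mfn = p2m[pfn];

	return mfn != INVALID_P2M_ENTRY && mfn_to_pfn(mfn) == pfn;
}

int main(void)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NPAGES; pfn++)
		printf("pfn %lu: %s\n", pfn, page_is_ours(pfn) ? "ours" : "skip");
	return 0;
}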
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 59 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0942630d47..9deb6bab6c7 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,52 +33,69 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); -static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, + phys_addr_t end_addr) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long start, end; - unsigned long len; + unsigned long len = 0; unsigned long pfn; int ret; start = PFN_UP(start_addr); - end = PFN_UP(end_addr); + end = PFN_DOWN(end_addr); if (end <= start) return 0; - len = end - start; - - set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); - reservation.nr_extents = len; - - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", - start, end, ret); - - for(pfn = start; pfn < end; pfn++) - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", + start, end); + for(pfn = start; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + if (ret == 1) { + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + len++; + } + } + printk(KERN_CONT "%ld pages freed\n", len); return len; } -static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, + const struct e820map *e820) { - unsigned long last_end = 0; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + phys_addr_t last_end = 0; unsigned long released = 0; int i; - for (i = 0; i < e820->nr_map; i++) { - released += xen_release_chunk(last_end, e820->map[i].addr); + for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { + phys_addr_t end = e820->map[i].addr; + end = min(max_addr, end); + + released += xen_release_chunk(last_end, end); last_end = e820->map[i].addr + e820->map[i].size; } - released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + if (last_end < max_addr) + released += xen_release_chunk(last_end, max_addr); printk(KERN_INFO "released %ld pages of unused memory\n", released); return released; @@ -118,7 +135,7 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(&e820); + xen_return_unused_memory(xen_start_info->nr_pages, &e820); return "Xen"; } -- cgit v1.2.3
From 5f755293ca61520b70b11afe1b1d6e1635cb6c00 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:48 -0700 Subject: x86, gcc-4.6: Avoid unused but set variables in rdmsr Avoids quite a lot of warnings with a gcc 4.6 -Wall build because this happens in a commonly used header file
(apic.h) Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJme6021066@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c5bc4c2d33f..084ef95274c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter) #define rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ - (val1) = (u32)__val; \ - (val2) = (u32)(__val >> 32); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(unsigned msr, unsigned low, unsigned high) -- cgit v1.2.3 From fa10ba64ac94fec4611b79804023eb087862ffe0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:49 -0700 Subject: x86, gcc-4.6: Fix set but not read variables Just some dead code, no real bugs. Found by gcc 4.6 -Wall Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJnQ0021072@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf23..a2e0caf26e1 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61..7d28d7d0388 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); -- cgit v1.2.3 From 7aa2b5f8ec60505160df1c25398e8286c8432689 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:48 +0200 Subject: x86, xsave: Do not include asm/i387.h in asm/xsave.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no dependencies to asm/i387.h. Instead, if including only xsave.h the following error occurs: .../arch/x86/include/asm/i387.h:110: error: ‘XSTATE_FP’ undeclared (first use in this function) .../arch/x86/include/asm/i387.h:110: error: (Each undeclared identifier is reported only once .../arch/x86/include/asm/i387.h:110: error: for each function it appears in.) This patch fixes this. 
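[Editor's note: the failure mode fixed here is a header that includes another header before defining the macros that header relies on. A minimal reproduction with two hypothetical headers -- not the real kernel files -- shows the ordering problem; the snippet is expected NOT to compile, which is the point.]

/* a.h -- plays the role of the old xsave.h */
#include "b.h"		/* b.h is parsed here, before FLAG_FP exists */
#define FLAG_FP 0x1

/* b.h -- plays the role of i387.h */
static inline int uses_fp(int mask)
{
	return mask & FLAG_FP;	/* error: 'FLAG_FP' undeclared -- the same
				 * shape as the XSTATE_FP report above */
}

/* The fix mirrors the patch: drop the include from a.h, and let code
 * that needs both headers include them explicitly, in a working order. */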
Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index ec86c5fd6a6..94d5f84d89f 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -3,7 +3,6 @@ #include #include -#include #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 -- cgit v1.2.3 From db10db48b2c530def21bfd76d576702c7df7f620 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:49 +0200 Subject: x86, xsave: 32/64 bit boot cpu check unification in initialization Boot cpu id is always 0, thus simplifying and unifying boot cpu check. boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f715efc594..26804b2986b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1273,7 +1273,7 @@ void __cpuinit cpu_init(void) /* * Boot processor to setup the FP and extended state context info. */ - if (smp_processor_id() == boot_cpu_id) + if (!smp_processor_id()) init_thread_xstate(); xsave_init(); -- cgit v1.2.3 From 82d4150cec83b9775f84810b39a1c0b91585d429 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:51 +0200 Subject: x86, xsave: Move boot cpu initialization to xsave_init() This patch moves boot cpu initialization to xsave_init(). Now all cpus are initialized in one single function. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 6 ------ arch/x86/kernel/i387.c | 5 ----- arch/x86/kernel/xsave.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26804b2986b..40561085d4f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1270,12 +1270,6 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 6106af9fd12..2f32ef05f10 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -93,11 +93,6 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. 
- */ - if (!smp_processor_id()) - init_thread_xstate(); xsave_init(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 368047c8d50..ab9ad48b653 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,7 +360,7 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static void __cpuinit __xsave_init(void) { if (!cpu_has_xsave) return; @@ -446,7 +446,7 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + __xsave_init(); /* * Recompute the context size for enabled features @@ -463,3 +463,13 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +void __cpuinit xsave_init(void) +{ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + __xsave_init(); +} -- cgit v1.2.3 From 92851e2fca48f1893f899963c13b55b61ac6956c Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 20 Jul 2010 15:19:46 -0700 Subject: x86, mm: Create symbolic index into address_markers array Without this, adding entries into the address_markers array means adding more and more of an #ifdef maze in pt_dump_init(). By using indices, we can keep it a bit saner. Signed-off-by: Andres Salomon LKML-Reference: <201007202219.o6KMJkUs021052@imap1.linux-foundation.org> Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/dump_pagetables.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a725b7f760a..0002a3a3308 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -37,6 +37,28 @@ struct addr_marker { const char *name; }; +/* indices for address_markers; keep sync'd w/ address_markers below */ +enum address_markers_idx { + USER_SPACE_NR = 0, +#ifdef CONFIG_X86_64 + KERNEL_SPACE_NR, + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, +#else + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +# ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +# endif + FIXADDR_START_NR, +#endif +}; + /* Address space markers hints */ static struct addr_marker address_markers[] = { { 0, "User Space" }, @@ -331,14 +353,12 @@ static int pt_dump_init(void) #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ - address_markers[2].start_address = VMALLOC_START; - address_markers[3].start_address = VMALLOC_END; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM - address_markers[4].start_address = PKMAP_BASE; - address_markers[5].start_address = FIXADDR_START; -# else - address_markers[4].start_address = FIXADDR_START; + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, -- cgit v1.2.3 From 468c30f2bbdf1ba0fbf16667eade23a46eaa8f06 Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Tue, 20 Jul 2010 15:19:47 -0700 Subject: x86, iomap: Fix wrong page aligned size calculation in ioremapping code x86 early_iounmap(): fix off-by-one error in page alignment of allocation size for 
sizes where size%PAGE_SIZE==1. Signed-off-by: Florian Zumbiehl LKML-Reference: <201007202219.o6KMJlES021058@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86')
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d41d3a9036c..3ba6e0608c5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -611,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) return; } offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { -- cgit v1.2.3
From a751bd858b16dce57f3b6b85ba07946df1bd7be4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Stay on fast path when count > 0 in __up_write() When count > 0 there is no need to take the call_rwsem_wake path. If we did take that path, it would just return without doing anything due to the active count not being zero. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJj9x021042@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 606ede12697..5bf5e04e497 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -216,9 +216,8 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* tries to transition - 0xffff0001 -> 0x00000000 */ - " jz 1f\n" + /* subtracts 0xffff0001, returns the old value */ + " jns 1f\n\t" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" -- cgit v1.2.3
From b4bcb4c28c64cc2876b4aef218d992ce806194da Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Minor cleanups Clarified a few comments and made initialization of %edx/%rdx more uniform across __down_write_nested, __up_read and __up_write functions. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJkiA021048@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/rwsem.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 5bf5e04e497..d1e41b0f9b6 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" LOCK_PREFIX _ASM_INC "(%1)\n\t" - /* adds 0x00000001, returns the old value */ + /* adds 0x00000001 */ " jns 1f\n" " call call_rwsem_down_read_failed\n" "1:\n\t" @@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { rwsem_count_t tmp; - - tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* subtract 0x0000ffff, returns the old value */ + /* adds 0xffff0001, returns the old value */ " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" @@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) "1:\n" "# ending down_write" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); } @@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n" "# ending __up_read\n" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); } @@ -218,7 +216,7 @@ static inline void __up_write(struct rw_semaphore *sem) LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n\t" "# ending __up_write\n" : "+m" (sem->count), "=d" (tmp) -- cgit v1.2.3
From 5edd19af18a36a4e22c570b1b969179e0ca1fe4c Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 20 Jul 2010 18:09:05 -0500 Subject: x86, UV: Make kdump avoid stack dumps UV NMI callbacks should not write stack dumps when a kdump is to be written. When invoking the crash kernel to write a dump, kdump_nmi_shootdown_cpus() uses NMIs to get all the CPUs to save their register context and halt. But the NMI interrupt handler runs a callback list. This patch sets a flag to prevent any of those callbacks from interfering with the halt of the cpu. For UV, which currently has the only callback to which this is relevant, the uv_handle_nmi() callback should not dump stacks. The 'in_crash_kexec' flag is defined as an extern in kdebug.h firstly because x2apic_uv_x.c includes it. Secondly because some future callback might need the flag to know that it should not enter the debugger. (Such a scenario was in fact present in the 2.6.32 kernel, SuSE distribution, where a call to kdb needed to be avoided.) Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/kdebug.h | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++++ arch/x86/kernel/crash.c | 3 +++ 3 files changed, 8 insertions(+) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index fa7c0b97476..7a2910b3512 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -33,5 +33,6 @@ extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +extern int in_crash_kexec; #endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e46f98f36e3..7b598b84c90 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { if (reason != DIE_NMI_IPI) return NOTIFY_OK; + + if (in_crash_kexec) + /* do nothing if entering the crash kernel */ + return NOTIFY_OK; /* * Use a lock so only one cpu prints at a time * to prevent intermixed output.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ebd4c51d096..764c7c2b181 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,6 +28,8 @@ #include #include +int in_crash_kexec; + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct die_args *args) @@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) static void kdump_nmi_shootdown_cpus(void) { + in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); -- cgit v1.2.3
From 3f8afb77cd8a672f024e4a16763ef177bc16c8f8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 21 Jul 2010 14:47:05 +0200 Subject: x86, tlb: Clean up and correct used type smp_processor_id() returns an int and not an unsigned long. Also, since the function is small enough, there's no need for a local variable caching its value. No functionality change, just cleanup. Signed-off-by: Borislav Petkov LKML-Reference: <20100721124705.GA674@aftab> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 426f3a1a64d..c03f14ab666 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) static void do_flush_tlb_all(void *info) { - unsigned long cpu = smp_processor_id(); - __flush_tlb_all(); if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(cpu); + leave_mm(smp_processor_id()); } void flush_tlb_all(void) -- cgit v1.2.3
From 0e49bf66d2ca649b167428adddbbbe9d9bd4894c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:52 +0200 Subject: x86, xsave: Separate fpu and xsave initialization As xsave also supports features other than the fpu, it should be initialized independently of the fpu. This patch moves this out of fpu initialization. There is also a lot of cross referencing between fpu and xsave code. This patch reduces this by making xsave_cntxt_init() and init_thread_xstate() static functions. The patch moves the cpu_has_xsave check to the beginning of xsave_init(). All other checks can then be removed.
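[Editor's note: the per-CPU bring-up order that results from this split is easier to follow as a sketch. The bodies below are stand-ins, not the kernel implementation; the point is that every CPU runs both init paths while the one-time sizing work is keyed off the boot CPU.]

#include <stdio.h>

static int xstate_size;				/* sized once, on the boot CPU */

static void init_thread_xstate(void) { xstate_size = 512; }	/* FXSAVE-era size */
static void xsave_cntxt_init(void)   { xstate_size = 832; }	/* CPUID 0xd size */
static void enable_xsave_hw(void)    { /* set CR4.OSXSAVE, program XCR0 */ }

static void bring_up_cpu(int cpu, int cpu_has_xsave)
{
	if (cpu == 0)
		init_thread_xstate();	/* fpu_init(): boot CPU only */

	if (!cpu_has_xsave)		/* xsave_init(): the single check left */
		return;
	if (cpu == 0)
		xsave_cntxt_init();	/* may overwrite xstate_size */
	enable_xsave_hw();		/* runs on every CPU */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		bring_up_cpu(cpu, 1);
	printf("xstate_size = %d\n", xstate_size);
	return 0;
}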
Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/i387.c | 27 +++++++++++++++++++-------- arch/x86/kernel/xsave.c | 10 +++++----- 5 files changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 59bd93ac7fe..509ddabeae2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); extern void __math_state_restore(void); -extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 94d5f84d89f..4d3b5d1fc02 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -28,7 +28,6 @@ extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void xsave_cntxt_init(void); extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 40561085d4f..94c36c7ac18 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1210,6 +1210,7 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1270,6 +1271,7 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2f32ef05f10..e73c54ebafc 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. 
*/ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,14 +94,24 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - xsave_init(); + if (!smp_processor_id()) + init_thread_xstate(); mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ + +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) +{ + if (!smp_processor_id()) + init_thread_xstate(); +} + +#endif /* CONFIG_X86_32 */ static void fpu_finit(struct fpu *fpu) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ab9ad48b653..550bf45236f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -362,9 +362,6 @@ */ static void __cpuinit __xsave_init(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); /* @@ -429,7 +426,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __cpuinit xsave_cntxt_init(void) { unsigned int eax, ebx, ecx, edx; @@ -466,10 +463,13 @@ void __ref xsave_cntxt_init(void) void __cpuinit xsave_init(void) { + if (!cpu_has_xsave) + return; + /* * Boot processor to setup the FP and extended state context info. */ if (!smp_processor_id()) - init_thread_xstate(); + xsave_cntxt_init(); __xsave_init(); } -- cgit v1.2.3
From 97e80a70db689fb1e876df9f12305cc72f85ca53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:53 +0200 Subject: x86, xsave: Introduce xstate enable functions The patch renames xsave_cntxt_init() and __xsave_init() into xstate_enable_boot_cpu() and xstate_enable() as these names are more meaningful. It also removes the duplicate xcr setup for the boot cpu. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 550bf45236f..2322f586c05 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,15 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static void __cpuinit __xsave_init(void) +static inline void xstate_enable(u64 mask) { set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); } /* @@ -426,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -static void __cpuinit xsave_cntxt_init(void) +static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -443,7 +438,8 @@ static void __cpuinit xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - __xsave_init(); + + xstate_enable(pcntxt_mask); /* * Recompute the context size for enabled features @@ -470,6 +466,7 @@ void __cpuinit xsave_init(void) * Boot processor to setup the FP and extended state context info.
*/ if (!smp_processor_id()) - xsave_cntxt_init(); - __xsave_init(); + xstate_enable_boot_cpu(); + else + xstate_enable(pcntxt_mask); } -- cgit v1.2.3 From ee813d53a8e980a3a28318efb8935d45723f5211 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:54 +0200 Subject: x86, xsave: Check cpuid level for XSTATE_CPUID (0x0d) The patch introduces the XSTATE_CPUID macro and adds a check that tests if XSTATE_CPUID exists. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-4-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 2 ++ arch/x86/kernel/xsave.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 4d3b5d1fc02..d1b5f3a2fa2 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -4,6 +4,8 @@ #include #include +#define XSTATE_CPUID 0x0000000d + #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2322f586c05..5adb7fb408f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -379,7 +379,7 @@ static void setup_xstate_features(void) xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); do { - cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); if (eax == 0) break; @@ -425,7 +425,12 @@ static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -444,7 +449,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); -- cgit v1.2.3 From 45c2d7f46211a0b1f6b425c59575c53145afc4b4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:55 +0200 Subject: x86, xsave: Make init_xstate_buf static The pointer is only used in xsave.c. Making it static. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/xsave.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index d1b5f3a2fa2..0ae6b996198 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,7 +27,6 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; -extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_init(void); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 5adb7fb408f..3b44a9b1eca 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,6 +16,11 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. 
+ */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; @@ -348,11 +353,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif -- cgit v1.2.3 From 4995b9dba908436c1611454f9bd2cb3ddf6babee Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:56 +0200 Subject: x86, xsave: Add __init attribute to setup_xstate_features() This is called only from initialization code. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-6-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3b44a9b1eca..cfc7901ee94 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -370,7 +370,7 @@ static inline void xstate_enable(u64 mask) * Record the offsets and sizes of different state managed by the xsave * memory layout. */ -static void setup_xstate_features(void) +static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; -- cgit v1.2.3 From 1cff92d8fdb27684308864d9cdb324bee43b40ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 Jul 2010 14:23:10 -0700 Subject: x86, xsave: Make xstate_enable_boot_cpu() __init, protect on CPU 0 xstate_enable_boot_cpu() is, as the name implies, only used on the boot CPU; furthermore, it invokes alloc_bootmem(), which is __init; hence it needs to be tagged __init rather than __cpuinit. Furthermore, it is *not* safe in the long run to rely on CPU 0 only coming online during the early boot -- at some point we're going to support offlining (and re-onlining) the boot CPU, and at that point we must not call xstate_enable_boot_cpu() again. The code is a fair bit more obscure than one would like, because the __ref overrides aren't quite powerful enough. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha Cc: Robert Richter LKML-Reference: <4C476236.1020302@zytor.com> --- arch/x86/kernel/xsave.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cfc7901ee94..b2549c3eb2c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,10 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static inline void xstate_enable(u64 mask) +static inline void xstate_enable(void) { set_in_cr4(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* @@ -421,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. 
*/ -static void __cpuinit xstate_enable_boot_cpu(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -444,7 +444,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xstate_enable(pcntxt_mask); + xstate_enable(); /* * Recompute the context size for enabled features @@ -462,16 +462,22 @@ static void __cpuinit xstate_enable_boot_cpu(void) pcntxt_mask, xstate_size); } +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ void __cpuinit xsave_init(void) { + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + if (!cpu_has_xsave) return; - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - xstate_enable_boot_cpu(); - else - xstate_enable(pcntxt_mask); + this_func = next_func; + next_func = xstate_enable; + this_func(); } -- cgit v1.2.3 From 8c06585d6431addadd94903843dfbcd315b42d4e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:26 -0400 Subject: x86: Remove redundant K6 MSRs MSR_K6_EFER is unused, and MSR_K6_STAR is redundant with MSR_STAR. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-1-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 2 -- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae431862..6068e0e06e0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -159,8 +159,6 @@ #define MSR_K7_FID_VID_STATUS 0xc0010042 /* K6 MSRs */ -#define MSR_K6_EFER 0xc0000080 -#define MSR_K6_STAR 0xc0000081 #define MSR_K6_WHCR 0xc0000082 #define MSR_K6_UWCCR 0xc0000085 #define MSR_K6_EPMR 0xc0000086 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..24a22069629 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -130,7 +130,7 @@ static struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { - { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 { .index = MSR_GS_BASE, .always = true }, @@ -2431,7 +2431,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = tsc_offset + native_read_tsc(); break; } - case MSR_K6_STAR: + case MSR_STAR: *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 @@ -2555,7 +2555,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; } - case MSR_K6_STAR: + case MSR_STAR: svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe7..b42ad25d564 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -231,14 +231,14 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. 
*/ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -1057,10 +1057,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0 && vmx->rdtscp_enabled) move_msr_up(vmx, index, save_nmsrs++); /* - * MSR_K6_STAR is only needed on long mode guests, and only + * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_K6_STAR); + index = __find_msr_index(vmx, MSR_STAR); if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d571f6f19..6127468ebbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -671,7 +671,7 @@ static u32 msrs_to_save[] = { HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, + MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif -- cgit v1.2.3 From cfaa71ee9794472598d3966c3315cd6bd8f953d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:27 -0400 Subject: x86: Use symbolic MSR names Use symbolic MSR names instead of hardcoding the MSR index. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-2-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/realmode/wakeup.S | 2 +- arch/x86/kernel/verify_cpu_64.S | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e29601..28595d6df47 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a..56a8c2a867d 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include +#include verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr -- cgit v1.2.3 From 650fb4393dff543bc980d361555c489fbdeed088 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:28 -0400 Subject: x86-64: Simplify loading initial_gs Load initial_gs as two 32-bit values instead of splitting a 64-bit value. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-3-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a..239046bd447 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. 
*/ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. -- cgit v1.2.3
From 4c21adf26f8fcf86a755b9b9f55c2e9fd241e1fb Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Jul 2010 16:59:34 -0700 Subject: x86 cpufreq, perf: Make trace_power_frequency cpufreq driver independent and fix the broken case where a core's frequency depends on others. trace_power_frequency was only implemented in a rather ungeneric way in the acpi-cpufreq driver's target() function. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequency depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and passing the cpu which gets switched automatically fixes this. Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Renninger Cc: davej@codemonkey.org.uk Signed-off-by: Ingo Molnar Cc: Dave Jones Acked-by: Arjan van de Ven Cc: Robert Schoene Tested-by: Robert Schoene Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40e..cee5263927c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..787572d43d9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -478,7 +478,7 @@
static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.3 From 18f19aa62a267f2f759e278018f1032adf4c3774 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 14 May 2010 12:38:24 +0100 Subject: xen: Add support for HVM hypercalls. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Sheng Yang Signed-off-by: Stefano Stabellini --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9c371e4a9fa..7fda040a76c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) return _hypercall2(int, nmi_op, op, arg); } +static inline unsigned long __must_check +HYPERVISOR_hvm_op(int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { -- cgit v1.2.3 From bee6ab53e652a414af20392899879b58cd80d033 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:39:33 +0100 Subject: x86: early PV on HVM features initialization. Initialize basic pv on hvm features adding a new Xen HVM specific hypervisor_x86 structure. Don't try to initialize xen-kbdfront and xen-fbfront when running on HVM because the backends are not available. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/hypervisor.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 1 + arch/x86/xen/enlighten.c | 100 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 70abda7058c..ff2546ce717 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc56a8..bffd47c10fe 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,6 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, + &x86_hyper_xen_hvm, }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..09b36e9d507 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,9 @@ #include #include #include +#include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -76,6 +79,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. 
@@ -1206,3 +1211,98 @@ asmlinkage void __init xen_start_kernel(void) x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } + +static uint32_t xen_cpuid_base(void) +{ + uint32_t base, eax, ebx, ecx, edx; + char signature[13]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &ebx, &ecx, &edx); + *(uint32_t *)(signature + 0) = ebx; + *(uint32_t *)(signature + 4) = ecx; + *(uint32_t *)(signature + 8) = edx; + signature[12] = 0; + + if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) + return base; + } + + return 0; +} + +static int init_hvm_pv_info(int *major, int *minor) +{ + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + *major = eax >> 16; + *minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info = xen_info; + pv_info.kernel_rpl = 0; + + xen_domain_type = XEN_HVM_DOMAIN; + + return 0; +} + +static void __init init_shared_info(void) +{ + struct xen_add_to_physmap xatp; + struct shared_info *shared_info_page; + + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +} + +static void __init xen_hvm_guest_init(void) +{ + int r; + int major, minor; + + r = init_hvm_pv_info(&major, &minor); + if (r < 0) + return; + + init_shared_info(); +} + +static bool __init xen_hvm_platform(void) +{ + if (xen_pv_domain()) + return false; + + if (!xen_cpuid_base()) + return false; + + return true; +} + +const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_hvm_platform, + .init_platform = xen_hvm_guest_init, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); -- cgit v1.2.3
From 38e20b07efd541a959de367dc90a17f92ce2e8a6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:40:51 +0100 Subject: x86/xen: event channels delivery on HVM. Set the callback to receive evtchns from Xen, using the callback vector delivery mechanism. The traditional way for receiving event channel notifications from Xen is via the interrupts from the platform PCI device. The callback vector is a newer alternative that allows us to receive notifications on any vcpu and doesn't need any PCI support: we allocate a vector exclusively to receive events; in the vector handler we don't need to interact with the vlapic, therefore we avoid a VMEXIT.
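[Editor's note: the delivery choice described above comes down to a single branch at init time. A hedged sketch follows; the enum and function are invented for illustration and are not Xen's actual interface.]

#include <stdio.h>

enum evtchn_delivery { DELIVER_PCI_INTX, DELIVER_VECTOR };

/* Prefer the dedicated vector when the hypervisor advertises it: the
 * handler runs without touching the emulated local APIC, so taking an
 * event does not force the extra VMEXIT that APIC emulation would. */
static enum evtchn_delivery pick_delivery(int have_callback_vector)
{
	return have_callback_vector ? DELIVER_VECTOR : DELIVER_PCI_INTX;
}

int main(void)
{
	printf("%s\n", pick_delivery(1) == DELIVER_VECTOR ?
	       "callback vector" : "platform PCI irq");
	return 0;
}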
Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/irq_vectors.h | 3 +++ arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/xen/enlighten.c | 28 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 5 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8767d99c4f6..e2ca3009255 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -125,6 +125,9 @@ */ #define MCE_SELF_VECTOR 0xeb +/* Xen vector callback to receive events in a HVM domain */ +#define XEN_HVM_EVTCHN_CALLBACK 0xe9 + #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..6b196834a0d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1166,6 +1166,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff13983..490ae2bb18a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 09b36e9d507..b211a04c4b2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -80,6 +82,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. 
We map the real shared_info @@ -1277,6 +1281,24 @@ static void __init init_shared_info(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; } +static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + static void __init xen_hvm_guest_init(void) { int r; @@ -1287,6 +1309,12 @@ static void __init xen_hvm_guest_init(void) return; init_shared_info(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + register_cpu_notifier(&xen_hvm_cpu_notifier); + have_vcpu_info_placement = 0; + x86_init.irqs.intr_init = xen_init_IRQ; } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bc..0d0e0e6a747 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -38,6 +38,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); + void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); -- cgit v1.2.3 From 016b6f5fe8398b0291cece60b749d7c930a2e09c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:45:07 +0100 Subject: xen: Add suspend/resume support for PV on HVM guests. Suspend/resume requires few different things on HVM: the suspend hypercall is different; we don't need to save/restore memory related settings; except the shared info page and the callback mechanism. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 24 ++++++++++++++++++------ arch/x86/xen/suspend.c | 6 ++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b211a04c4b2..127c95c8d15 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1262,13 +1262,15 @@ static int init_hvm_pv_info(int *major, int *minor) return 0; } -static void __init init_shared_info(void) +void xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; - struct shared_info *shared_info_page; + static struct shared_info *shared_info_page = 0; - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; @@ -1278,7 +1280,17 @@ static void __init init_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. 
*/ + for_each_online_cpu(cpu) { + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } } static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, @@ -1308,7 +1320,7 @@ static void __init xen_hvm_guest_init(void) if (r < 0) return; - init_shared_info(); + xen_hvm_init_shared_info(); if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index a9c66110803..d07479c340f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,12 @@ void xen_pre_suspend(void) BUG(); } +void xen_hvm_post_suspend(int suspend_cancelled) +{ + xen_hvm_init_shared_info(); + xen_callback_vector(); +} + void xen_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d0e0e6a747..01c9dd38652 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -39,6 +39,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.2.3 From 409771d258e9dd71c30f3c9520fd2b796ffc40f0 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:48:19 +0100 Subject: x86: Use xen_vcpuop_clockevent, xen_clocksource and xen wallclock. Use xen_vcpuop_clockevent instead of hpet and APIC timers as main clockevent device on all vcpus, use the xen wallclock time as wallclock instead of rtc and use xen_clocksource as clocksource. The pv clock algorithm needs to work correctly for the xen_clocksource and xen wallclock to be usable, only modern Xen versions offer a reliable pv clock in HVM guests (XENFEAT_hvm_safe_pvclock). Using the hpet as clocksource means a VMEXIT every time we read/write to the hpet mmio addresses, pvclock give us a better rating without VMEXITs. Same goes for the xen wallclock and xen_vcpuop_clockevent Signed-off-by: Stefano Stabellini Signed-off-by: Don Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 14 ++---------- arch/x86/xen/suspend.c | 6 +++++ arch/x86/xen/time.c | 58 +++++++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/xen-ops.h | 7 ++---- 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 127c95c8d15..a9017296388 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -935,10 +935,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, }; -static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, -}; - static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, @@ -1076,7 +1072,6 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; @@ -1084,13 +1079,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; - x86_init.timers.timer_init = xen_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = x86_init_noop; - - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + xen_init_time_ops(); /* * Set up some pagetable state before starting to set any ptes. 
@@ -1327,6 +1316,7 @@ static void __init xen_hvm_guest_init(void) register_cpu_notifier(&xen_hvm_cpu_notifier); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d07479c340f..1d789d56877 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -28,8 +28,14 @@ void xen_pre_suspend(void) void xen_hvm_post_suspend(int suspend_cancelled) { + int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } } void xen_post_suspend(int suspend_cancelled) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed30..4780e55886a 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -160,7 +161,7 @@ static void do_stolen_accounting(void) * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED * states. */ -unsigned long long xen_sched_clock(void) +static unsigned long long xen_sched_clock(void) { struct vcpu_runstate_info state; cycle_t now; @@ -195,7 +196,7 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ -unsigned long xen_tsc_khz(void) +static unsigned long xen_tsc_khz(void) { struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; @@ -230,7 +231,7 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -unsigned long xen_get_wallclock(void) +static unsigned long xen_get_wallclock(void) { struct timespec ts; @@ -238,7 +239,7 @@ unsigned long xen_get_wallclock(void) return ts.tv_sec; } -int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(unsigned long now) { /* do nothing for domU */ return -1; @@ -473,7 +474,11 @@ void xen_timer_resume(void) } } -__init void xen_time_init(void) +static const struct pv_time_ops xen_time_ops __initdata = { + .sched_clock = xen_sched_clock, +}; + +static __init void xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -497,3 +502,46 @@ __init void xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } + +__init void xen_init_time_ops(void) +{ + pv_time_ops = xen_time_ops; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + +static void xen_hvm_setup_cpu_clockevents(void) +{ + int cpu = smp_processor_id(); + xen_setup_runstate_info(cpu); + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} + +__init void xen_hvm_init_time_ops(void) +{ + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 */ + if (!xen_have_vector_callback && num_present_cpus() > 1) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { + printk(KERN_INFO "Xen doesn't support pvclock on HVM," + "disable pv timer\n"); + return; + } + + pv_time_ops = xen_time_ops; + x86_init.timers.setup_percpu_clockev = xen_time_init; + x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 
01c9dd38652..089d18923d2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -49,11 +49,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -unsigned long xen_tsc_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); +void __init xen_init_time_ops(void); +void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.2.3 From c1c5413ad58cb73267d328e6020268aa2e50d8ca Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:44:30 +0100 Subject: x86: Unplug emulated disks and nics. Add a xen_emul_unplug command line option to the kernel to unplug xen emulated disks and nics. Set the default value of xen_emul_unplug depending on whether or not the Xen PV frontends and the Xen platform PCI driver have been compiled for this kernel (modules or built-in are both OK). The user can specify xen_emul_unplug=ignore to enable PV drivers on HVM even if the host platform doesn't support unplug. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/platform-pci-unplug.c | 135 +++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 4 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/platform-pci-unplug.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f..93095468598 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o + grant-table.o suspend.o platform-pci-unplug.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a9017296388..157c93b62dd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1314,6 +1314,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 00000000000..2f7f3fb3477 --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c @@ -0,0 +1,135 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#include +#include +#include + +#include + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +/* store the value of xen_emul_unplug after the unplug is done */ +int xen_platform_pci_unplug; +EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +static int xen_emul_unplug; + +static int __init check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +void __init xen_unplug_emulated_devices(void) +{ + int r; + + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the version is really old + * and the user told us to ignore it. */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_IGNORE))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). 
*/ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "ignore", l)) + xen_emul_unplug |= XEN_UNPLUG_IGNORE; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 089d18923d2..ed776949024 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -40,6 +40,7 @@ void xen_vcpu_restore(void); void xen_callback_vector(void); void xen_hvm_init_shared_info(void); +void __init xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.2.3 From 5915100106b8f14a38053ad6c03a664d208aeaa2 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 17 Jun 2010 14:22:52 +0100 Subject: x86: Call HVMOP_pagetable_dying on exit_mmap. When a pagetable is about to be destroyed, we notify Xen so that the hypervisor can clear the related shadow pagetable. 
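[Annotator's note: the hook added below is reached from the generic mm teardown path. Simplified, and reproduced from memory as a sketch rather than verbatim, the chain looks like this; arch_exit_mmap() and the pv_mmu_ops indirection are pre-existing x86 code.]

	/* mm/mmap.c: final teardown of an address space (simplified) */
	void exit_mmap(struct mm_struct *mm)
	{
		arch_exit_mmap(mm);	/* x86: PVOP call to pv_mmu_ops.exit_mmap */
		/* ... unmap VMAs, free the page tables ... */
	}

So once xen_hvm_init_mmu_ops() has pointed pv_mmu_ops.exit_mmap at the Xen handler, every address-space destruction issues one HVMOP_pagetable_dying hypercall before the page tables are freed, letting Xen drop the shadow early instead of faulting it away page by page.
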
Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.h | 1 + 3 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 157c93b62dd..75b479a684f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1318,6 +1318,7 @@ static void __init xen_hvm_guest_init(void) have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..84648c1bf13 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include @@ -1941,6 +1942,38 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +} + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe6bc7f5ec..fa938c4aa2f 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); +extern void xen_hvm_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit v1.2.3 From b43275d661baa5f1f72dacd9033d6eda09d9fe87 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 Jul 2010 10:38:45 -0700 Subject: xen/pvhvm: fix build problem when !CONFIG_XEN x86_hyper_xen_hvm is only defined when Xen is enabled in the kernel config. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/hypervisor.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bffd47c10fe..5bccedcb912 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN &x86_hyper_xen_hvm, +#endif }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.2.3 From 8c73626ab28527b7eb7f3061c027fbfe530c488c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:18 -0700 Subject: x86: Fix vtime/file timestamp inconsistencies Due to vtime calling vgettimeofday(), its possible that an application could call time();create("stuff",O_RDRW); only to see the file's creation timestamp to be before the value returned by time. A similar way to reproduce the issue is to compare the vsyscall time() with the syscall time(), and observe ordering issues. 
The modified test case from Oleg Nesterov below can illustrate this: int main(void) { time_t sec1,sec2; do { sec1 = time(&sec2); sec2 = syscall(__NR_time, NULL); } while (sec1 <= sec2); printf("vtime: %d.000000\n", sec1); printf("time: %d.000000\n", sec2); return 0; } The proper fix is to make vtime use the same time value as current_kernel_time() (which is exported via update_vsyscall) instead of vgettime(). Thanks to Jiri Olsa for bringing up the issue and catching bugs in earlier verisons of this fix. Signed-off-by: John Stultz Cc: Jiri Olsa Cc: Oleg Nesterov LKML-Reference: <1279068988-21864-2-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60..dce0c3c5a78 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; -- cgit v1.2.3 From 592913ecb87a9e06f98ddb55b298f1a66bf94c6b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:20 -0700 Subject: time: Kill off CONFIG_GENERIC_TIME Now that all arches have been converted over to use generic time via clocksources or arch_gettimeoffset(), we can remove the GENERIC_TIME config option and simplify the generic code. Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-4-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..546b610ad71 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,9 +72,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 -config GENERIC_TIME - def_bool y - config GENERIC_CMOS_UPDATE def_bool y @@ -2046,7 +2043,7 @@ config SCx200 config SCx200HR_TIMER tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" - depends on SCx200 && GENERIC_TIME + depends on SCx200 default y ---help--- This driver provides a clocksource built upon the on-chip -- cgit v1.2.3 From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:23 -0700 Subject: timkeeping: Fix update_vsyscall to provide wall_to_monotonic offset update_vsyscall() did not provide the wall_to_monotoinc offset, so arch specific implementations tend to reference wall_to_monotonic directly. 
This limits future cleanups in the timekeeping core, so this patch fixes the update_vsyscall interface to provide wall_to_monotonic, allowing wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: Anton Blanchard Cc: Paul Mackerras Cc: Tony Luck LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dce0c3c5a78..dcbb28c4b69 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -- cgit v1.2.3 From f12a15be63d1de9a35971f35f06b73088fa25c3a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:27 -0700 Subject: x86: Convert common clocksources to use clocksource_register_hz/khz This converts the most common of the x86 clocksources over to use clocksource_register_hz/khz. 
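[Annotator's note: to make the conversion pattern concrete, here is a hedged before/after sketch for a driver-owned clocksource. The hardware read, read_hw_counter(), is a made-up placeholder; the rest follows the clocksource API as of this series.]

	#include <linux/clocksource.h>

	static cycle_t read_mycounter(struct clocksource *cs)
	{
		return (cycle_t)read_hw_counter();	/* hypothetical MMIO read */
	}

	static struct clocksource mycs = {
		.name	= "mycounter",
		.rating	= 200,
		.read	= read_mycounter,
		.mask	= CLOCKSOURCE_MASK(32),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init mycs_register(unsigned long freq_khz)
	{
		/* Old style: pick a shift by hand and derive mult from it:
		 *	mycs.shift = 22;
		 *	mycs.mult  = clocksource_khz2mult(freq_khz, mycs.shift);
		 *	return clocksource_register(&mycs);
		 * New style: let the core compute an optimal mult/shift pair
		 * from the counter frequency:
		 */
		return clocksource_register_khz(&mycs, freq_khz);
	}
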
Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-11-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 13 +++++++++---- arch/x86/kernel/tsc.c | 5 +---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d73117..33dbcc4ec5f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae184..ce8e5023933 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 80a506b8fdcfa868bb53eb740f928217d0966fc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 17:14:24 +0200 Subject: x86/amd-iommu: Export cache-coherency capability This patch exports the capability of the AMD IOMMU to force cache coherency of DMA transactions through the IOMMU-API. This is required to disable some nasty hacks in KVM when this capability is not available. 
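[Annotator's note: on the consumer side the capability is queried through the IOMMU API. A minimal sketch of the kind of check KVM can now make; iommu_domain_has_cap() is the existing IOMMU-API entry point that lands in the amd_iommu_domain_has_cap() implementation below.]

	#include <linux/iommu.h>

	/* Returns true when DMA through this domain is NOT forced coherent,
	 * i.e. when the caller still needs its cache-flushing workaround. */
	static bool domain_needs_cache_hack(struct iommu_domain *dom)
	{
		return !iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY);
	}
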
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 29dd3b9f2f0..fa044e1e30a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } -- cgit v1.2.3 From d2cb214551de8180542a04ec8c86c0c9412c5124 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 26 Mar 2010 15:37:50 -0700 Subject: xen/mmu: inhibit vmap aliases rather than trying to clear them out Rather than trying to deal with aliases once they appear, just completely inhibit them. Mostly the removal of aliases was managable, but it comes unstuck in xen_create_contiguous_region() because it gets executed at interrupt time (as a result of dma_alloc_coherent()), which causes all sorts of confusion in the vmap code, as it was never intended to be run in interrupt context. This has the unfortunate side effect of removing all the unmap batching the vmap code so carefully added, but that can't be helped. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb51402dd99..ef5728dde8f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1015,8 +1016,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - vm_unmap_aliases(); - xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1580,7 +1579,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - vm_unmap_aliases(); if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE && USE_SPLIT_PTLOCKS) @@ -2026,6 +2024,8 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; + + vmap_lazy_unmap = false; } /* Protected by xen_reservation_lock. */ @@ -2165,8 +2165,6 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Zap current PTEs, remembering MFNs. */ @@ -2204,8 +2202,6 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ -- cgit v1.2.3 From bbbe57386e857eb2a8d4abcae71063c819c06ff1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 9 Feb 2010 14:30:55 -0500 Subject: pci-swiotlb-xen: Add glue code to setup dma_ops utilizing xen_swiotlb_* functions. We add the glue code that sets up a dma_ops structure with the xen_swiotlb_* functions. 
The code turns on xen_swiotlb flag when it detects it is running under Xen and it is either in privileged mode or the iommu=soft flag was passed in. It also disables the bare-metal SWIOTLB if the Xen-SWIOTLB has been enabled. Note: The Xen-SWIOTLB is only built when CONFIG_XEN is enabled. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Cc: FUJITA Tomonori Cc: Albert Herranz Cc: Ian Campbell --- arch/x86/include/asm/xen/swiotlb-xen.h | 14 ++++++++ arch/x86/xen/Makefile | 1 + arch/x86/xen/pci-swiotlb-xen.c | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 arch/x86/include/asm/xen/swiotlb-xen.h create mode 100644 arch/x86/xen/pci-swiotlb-xen.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h new file mode 100644 index 00000000000..1be1ab7d6a4 --- /dev/null +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_SWIOTLB_XEN_H +#define _ASM_X86_SWIOTLB_XEN_H + +#ifdef CONFIG_SWIOTLB_XEN +extern int xen_swiotlb; +extern int __init pci_xen_swiotlb_detect(void); +extern void __init pci_xen_swiotlb_init(void); +#else +#define xen_swiotlb (0) +static inline int __init pci_xen_swiotlb_detect(void) { return 0; } +static inline void __init pci_xen_swiotlb_init(void) { } +#endif + +#endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f..32af238055c 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 00000000000..a013ec9d0c5 --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -0,0 +1,58 @@ +/* Glue code to lib/swiotlb-xen.c */ + +#include +#include + +#include +#include + +int xen_swiotlb __read_mostly; + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc_coherent = xen_swiotlb_alloc_coherent, + .free_coherent = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, +}; + +/* + * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use xen_swiotlb (by the boot + * option). + */ +int __init pci_xen_swiotlb_detect(void) +{ + + /* If running as PV guest, either iommu=soft, or swiotlb=force will + * activate this IOMMU. If running as PV privileged, activate it + * irregardlesss. + */ + if ((xen_initial_domain() || swiotlb || swiotlb_force) && + (xen_pv_domain())) + xen_swiotlb = 1; + + /* If we are running under Xen, we MUST disable the native SWIOTLB. + * Don't worry about swiotlb_force flag activating the native, as + * the 'swiotlb' flag is the only one turning it on. 
*/ + if (xen_pv_domain()) + swiotlb = 0; + + return xen_swiotlb; +} + +void __init pci_xen_swiotlb_init(void) +{ + if (xen_swiotlb) { + xen_swiotlb_init(1); + dma_ops = &xen_swiotlb_dma_ops; + } +} -- cgit v1.2.3 From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 17:01:49 -0700 Subject: x86: Add memory modify constraints to xchg() and cmpxchg() xchg() and cmpxchg() modify their memory operands, not merely read them. For some versions of gcc the "memory" clobber has apparently dealt with the situation, but not for all. Originally-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Glauber Costa Cc: Avi Kivity Cc: Peter Palfrader Cc: Greg KH Cc: Alan Cox Cc: Zachary Amsden Cc: Marcelo Tosatti Cc: LKML-Reference: <4C4F7277.8050306@zytor.com> --- arch/x86/include/asm/cmpxchg_32.h | 68 +++++++++++++++++++-------------------- arch/x86/include/asm/cmpxchg_64.h | 40 +++++++++++------------ 2 files changed, 54 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 8859e12dd3c..c1cf59d72f0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -27,20 +27,20 @@ struct __xchg_dummy { switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -70,14 +70,14 @@ static inline void __set_64bit(unsigned long long *ptr, unsigned int low, unsigned int high) { asm volatile("\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%0)\n\t" + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + LOCK_PREFIX "cmpxchg8b (%1)\n\t" "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) + : "=m" (*ptr) + : "D" (ptr), + "b" (low), + "c" (high) : "ax", "dx", "memory"); } @@ -121,21 +121,21 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ @@ -180,12 +180,12 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile(LOCK_PREFIX "cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned 
long)(new >> 32)), + "0" (old) : "memory"); return prev; } @@ -195,12 +195,12 @@ static inline unsigned long long __cmpxchg64_local(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile("cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile("cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned long)(new >> 32)), + "0" (old) : "memory"); return prev; } diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 485ae415fae..b92f147339f 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -26,26 +26,26 @@ extern void __cmpxchg_wrong_size(void); switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %k0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 8: \ asm volatile("xchgq %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -71,27 +71,27 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %k1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %k2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 8: \ - asm volatile(lock "cmpxchgq %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ -- cgit v1.2.3 From c7f52cdc2f3e1733d3864e439ac2e92edd99ef31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 22 Jul 2010 22:58:01 -0700 Subject: support multiple .discard.* sections to avoid section type conflicts gcc 4.4.4 will complain if you use a .discard section for both text and data ("causes a section type conflict"). Add support for ".discard.*" sections, and use .discard.text for a dummy function in the x86 RESERVE_BRK() macro. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 86b1506f417..ef292c792d7 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align); * executable.) 
*/ #define RESERVE_BRK(name,sz) \ - static void __section(.discard) __used \ + static void __section(.discard.text) __used \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit v1.2.3 From 69309a05907546fb686b251d4ab041c26afe1e1d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:29:52 -0700 Subject: x86, asm: Clean up and simplify set_64bit() Clean up and simplify set_64bit(). This code is quite old (1.3.11) and contains a fair bit of auxilliary machinery that current versions of gcc handle just fine automatically. Worse, the auxilliary machinery can actually cause an unnecessary spill to memory. Furthermore, the loading of the old value inside the loop in the 32-bit case is unnecessary: if the value doesn't match, the CMPXCHG8B instruction will already have loaded the "new previous" value for us. Clean up the comment, too, and remove page references to obsolete versions of the Intel SDM. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 67 ++++++++++++--------------------------- arch/x86/include/asm/cmpxchg_64.h | 4 +-- 2 files changed, 21 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index c1cf59d72f0..20955ea7bc1 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -53,60 +53,33 @@ struct __xchg_dummy { __xchg((v), (ptr), sizeof(*ptr)) /* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) + * CMPXCHG8B only writes to the target if we had the previous + * value in registers, otherwise it acts as a read and gives us the + * "new previous" value. That is why there is a loop. Preloading + * EDX:EAX is a performance optimization: in the common case it means + * we need only one locked operation. * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very + * least an FPU save and/or %cr0.ts manipulation. + * + * cmpxchg8b must be used with the lock prefix here to allow the + * instruction to be executed atomically. We need to have the reader + * side to see the coherent 64bit value. 
*/ -static inline void __set_64bit(unsigned long long *ptr, - unsigned int low, unsigned int high) +static inline void set_64bit(volatile u64 *ptr, u64 value) { + u32 low = value; + u32 high = value >> 32; + u64 prev = *ptr; + asm volatile("\n1:\t" - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%1)\n\t" + LOCK_PREFIX "cmpxchg8b %0\n\t" "jnz 1b" - : "=m" (*ptr) - : "D" (ptr), - "b" (low), - "c" (high) - : "ax", "dx", "memory"); -} - -static inline void __set_64bit_constant(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); -} - -#define ll_low(x) *(((unsigned int *)&(x)) + 0) -#define ll_high(x) *(((unsigned int *)&(x)) + 1) - -static inline void __set_64bit_var(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, ll_low(value), ll_high(value)); + : "=m" (*ptr), "+A" (prev) + : "b" (low), "c" (high) + : "memory"); } -#define set_64bit(ptr, value) \ - (__builtin_constant_p((value)) \ - ? __set_64bit_constant((ptr), (value)) \ - : __set_64bit_var((ptr), (value))) - -#define _set_64bit(ptr, value) \ - (__builtin_constant_p(value) \ - ? __set_64bit(ptr, (unsigned int)(value), \ - (unsigned int)((value) >> 32)) \ - : __set_64bit(ptr, ll_low((value)), ll_high((value)))) - extern void __cmpxchg_wrong_size(void); /* diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index b92f147339f..9596e7c6196 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -5,13 +5,11 @@ #define __xg(x) ((volatile long *)(x)) -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -#define _set_64bit set_64bit - extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); -- cgit v1.2.3 From 18642a57df02a044b91219d3176128996ddc81a5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:52:29 -0700 Subject: x86, vdso: Don't quote $nm in the script for checking vdso references Don't quote $nm in the script for checking the vdso for external references. Doing so breaks multiword constructs, like using CROSS_COMPILE='ccache '. Reported-by: Stephen Rothwell Signed-off-by: H. Peter Anvin LKML-Reference: <20100728134252.2e4c27cf.sfr@canb.auug.org.au> --- arch/x86/vdso/checkundef.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh index 490be1c38f9..7ee90a9b549 100755 --- a/arch/x86/vdso/checkundef.sh +++ b/arch/x86/vdso/checkundef.sh @@ -1,7 +1,7 @@ #!/bin/sh nm="$1" file="$2" -"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +$nm "$file" | grep '^ *U' > /dev/null 2>&1 if [ $? -eq 1 ]; then exit 0 else -- cgit v1.2.3 From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:25 -0500 Subject: fanotify: fanotify_init syscall declaration This patch defines a new syscall fanotify_init() of the form: int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags, unsigned int priority) This syscall is used to create and fanotify group. This is very similar to the inotify_init() syscall. 
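[Annotator's note: from userspace the new entry point is reachable with a raw syscall(2) until a glibc wrapper exists. A hedged sketch matching the three-argument signature declared above; the flags and priority values belong to the fanotify UAPI, and 0/O_RDONLY/0 are used here purely for illustration.]

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/syscall.h>

	int main(void)
	{
		/* fanotify_init(flags, event_f_flags, priority) */
		int fd = syscall(__NR_fanotify_init, 0, O_RDONLY, 0);

		if (fd < 0) {
			perror("fanotify_init");
			return 1;
		}
		return 0;
	}
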
Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa..586cb3be2e3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_fanotify_init ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a..981c7e7ad80 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_fanotify_init 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81..4f23e04bdb3 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_fanotify_init 300 +__SYSCALL(__NR_fanotify_init, sys_fanotify_init) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b372934121..e38793b50e1 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init -- cgit v1.2.3 From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:26 -0500 Subject: fanotify: sys_fanotify_mark declartion This patch simply declares the new sys_fanotify_mark syscall int fanotify_mark(int fanotify_fd, unsigned int flags, u64_mask, int dfd const char *pathname) Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/ia32/sys_ia32.c | 9 +++++++++ arch/x86/include/asm/sys_ia32.h | 3 +++ arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 6 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 586cb3be2e3..17cf65c9480 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -843,4 +843,5 @@ ia32_sys_call_table: .quad sys_perf_event_open .quad compat_sys_recvmmsg .quad sys_fanotify_init + .quad sys32_fanotify_mark ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 626be156d88..3d093311d5e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags, + u32 mask_lo, u32 mask_hi, + int fd, const char __user *pathname) +{ + return sys_fanotify_mark(fanotify_fd, flags, + ((u64)mask_hi << 
32) | mask_lo, + fd, pathname); +} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3ad421784ae..cf4e2e381cb 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *); /* ia32/ipc32.c */ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); + +asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, + const char __user *); #endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 981c7e7ad80..80b799cd74f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -344,10 +344,11 @@ #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 #define __NR_fanotify_init 338 +#define __NR_fanotify_mark 339 #ifdef __KERNEL__ -#define NR_syscalls 339 +#define NR_syscalls 340 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4f23e04bdb3..5b7b1d58561 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -665,6 +665,8 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_fanotify_init 300 __SYSCALL(__NR_fanotify_init, sys_fanotify_init) +#define __NR_fanotify_mark 301 +__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e38793b50e1..07ad5eb7cc5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -338,3 +338,4 @@ ENTRY(sys_call_table) .long sys_perf_event_open .long sys_recvmmsg .long sys_fanotify_init + .long sys_fanotify_mark -- cgit v1.2.3 From d78d671db478eb8b14c78501c0cee1cc7baf6967 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:30 +0200 Subject: x86, cpu: AMD errata checking framework Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that have an OSVW id assigned, which it takes as first argument. Both take a variable number of family-specific model-stepping ranges created by AMD_MODEL_RANGE(). Iff an erratum has an OSVW id, OSVW is available on the CPU, and the OSVW id is known to the hardware, it is used to determine whether an erratum is present. Otherwise, the model-stepping ranges are matched against the current CPU to find out whether the erratum applies. For certain special errata, the code using this framework might have to conduct further checks to make sure an erratum is really (not) present. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 18 ++++++++++++ arch/x86/kernel/cpu/amd.c | 60 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7e5c6a60b8e..5084c2f5ac2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1025,4 +1025,22 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, return ratio; } +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) 
{ -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12b9cff047c..80665410b06 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -609,3 +609,63 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} -- cgit v1.2.3 From 9d8888c2a214aece2494a49e699a097c2ba9498b Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:31 +0200 Subject: x86, cpu: Clean up AMD erratum 400 workaround Remove check_c1e_idle() and use the new AMD errata checking framework instead. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-2-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 5 +++++ arch/x86/kernel/process.c | 39 ++------------------------------------- 3 files changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5084c2f5ac2..eebdc1fde3d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); #define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80665410b06..a62a4ae7a11 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,6 +628,11 @@ cpu_dev_register(amd_cpu_dev); * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); */ +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + + bool cpu_has_amd_erratum(const int *erratum) { struct cpuinfo_x86 *cpu = &current_cpu_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..553b02f1309 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} - static cpumask_var_t c1e_mask; static int c1e_detected; @@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else -- cgit v1.2.3 From 1be85a6d93f4207d8c2c6238c4a96895e28cefba Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:32 +0200 Subject: x86, cpu: Use AMD errata checking framework for erratum 383 Use the AMD errata checking framework instead of open-coding the test. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-3-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 2 ++ arch/x86/kvm/svm.c | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eebdc1fde3d..d85637bb950 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_383[]; extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a62a4ae7a11..30f30dcbdb8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,6 +632,8 @@ const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); bool cpu_has_amd_erratum(const int *erratum) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..03b534b34ee 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -383,8 +383,7 @@ static void svm_init_erratum_383(void) int err; u64 val; - /* Only Fam10h is affected */ - if (boot_cpu_data.x86 != 0x10) + if (!cpu_has_amd_erratum(amd_erratum_383)) return; /* Use _safe variants to not break nested virtualization */ -- cgit v1.2.3 From 4532b305e8f0c238dd73048068ff8a6dd1380291 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 15:18:35 -0700 Subject: x86, asm: Clean up and simplify Remove the __xg() hack to create a memory barrier near xchg and cmpxchg; it has been there since 1.3.11 but should not be necessary with "asm volatile" and a "memory" clobber, neither of which were there in the original implementation. However, we *should* make this a volatile reference. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 75 ++++++++++++++++++++++----------------- arch/x86/include/asm/cmpxchg_64.h | 61 +++++++++++++++++++++---------- 2 files changed, 84 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 20955ea7bc1..f5bd1fd388f 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -11,38 +11,42 @@ extern void __xchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. 
*/ - -struct __xchg_dummy { - unsigned long a[100]; -}; -#define __xg(x) ((struct __xchg_dummy *)(x)) - #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -94,23 +98,32 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ @@ -148,31 +161,27 @@ extern void __cmpxchg_wrong_size(void); (unsigned long long)(n))) #endif -static inline unsigned long long __cmpxchg64(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile(LOCK_PREFIX "cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; } -static inline unsigned long long __cmpxchg64_local(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile("cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 9596e7c6196..423ae58aa02 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,8 +3,6 @@ #include /* Provides LOCK_PREFIX */ -#define __xg(x) ((volatile long *)(x)) - static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; @@ -14,38 +12,51 @@ extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. 
--ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. */ #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile("xchgl %k0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -69,29 +80,41 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile(lock "cmpxchgl %k2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile(lock "cmpxchgq %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ -- cgit v1.2.3 From a5b91606bdc9d0a0d036d2d829a22921c705573e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:23:20 -0700 Subject: x86, cpu: Export AMD errata definitions Export the AMD errata definitions, since they are needed by kvm_amd.ko if that is built as a module. Doing "make allmodconfig" during testing would have caught this. Signed-off-by: H. 
Peter Anvin Cc: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> --- arch/x86/kernel/cpu/amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 30f30dcbdb8..60a57b13082 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,9 +631,11 @@ cpu_dev_register(amd_cpu_dev); const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { @@ -676,3 +678,5 @@ bool cpu_has_amd_erratum(const int *erratum) return false; } + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); -- cgit v1.2.3 From 90c8f92f5c807807ca74d5f2f313794925174e6b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:53:49 -0700 Subject: x86, asm: Move cmpxchg emulation code to arch/x86/lib Move cmpxchg emulation code from arch/x86/kernel/cpu (which is otherwise CPU identification) to arch/x86/lib, where other emulation code lives already. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cmpxchg.c | 72 ------------------------------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/cmpxchg.c | 72 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 73 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cmpxchg.c create mode 100644 arch/x86/lib/cmpxchg.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6..c47c43914ba 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,7 +16,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572c..00000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include -#include -#include - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. 
Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f871e04b696..e10cf070ede 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o + lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c new file mode 100644 index 00000000000..2056ccf572c --- /dev/null +++ b/arch/x86/lib/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + -- cgit v1.2.3 From a378d9338e8dde78314b3a6ae003de351936c729 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 17:05:11 -0700 Subject: x86, asm: Merge cmpxchg_486_u64() and cmpxchg8b_emu() We have two functions for doing exactly the same thing -- emulating cmpxchg8b on 486 and older hardware -- with different calling conventions, and yet doing the same thing. Drop the C version and use the assembly version, via alternatives, for both the local and non-local versions of cmpxchg8b. Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 30 ++++++++++++++---------------- arch/x86/lib/cmpxchg.c | 18 ------------------ 2 files changed, 14 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f5bd1fd388f..284a6e8f7ce 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -246,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); - #define cmpxchg64(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ @@ -265,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); __ret; }) - -#define cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64_local(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) #endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c index 2056ccf572c..5d619f6df3e 100644 --- a/arch/x86/lib/cmpxchg.c +++ b/arch/x86/lib/cmpxchg.c @@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) } EXPORT_SYMBOL(cmpxchg_386_u32); #endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -- cgit v1.2.3 From 3709c857350976408953831f0cf89d19951394a1 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Thu, 22 Jul 2010 14:57:35 +0900 Subject: x86: Ioremap: fix wrong physical address handling in PAT code The following two commits fixed a problem where x86 ioremap() doesn't handle physical addresses higher than 32 bits properly in X86_32 PAE mode. ffa71f33a820d1ab3f2fc5723819ac60fb76080b (x86, ioremap: Fix incorrect physical address handling in PAE mode) 35be1b716a475717611b2dc04185e9d80b9cb693 (x86, ioremap: Fix normal ram range check) But these fixes are not enough, since pat_pagerange_is_ram() in PAT code also has the same problem. This patch fixes it. 
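For illustration only (a minimal user-space C sketch with hypothetical values, not part of the patch): under PAE a physical address can exceed 32 bits, so storing it in a 32-bit 'unsigned long' silently drops the high bits, which is why the range check has to take resource_size_t.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t phys = 0x100000000ULL;       /* 4 GiB: representable under PAE */
	uint32_t as_ulong32 = (uint32_t)phys; /* what a 32-bit ulong would keep */

	/* prints: full 0x100000000, truncated 0x0 */
	printf("full %#llx, truncated %#x\n",
	       (unsigned long long)phys, as_ulong32);
	return 0;
}
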
Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Kenji Kaneshige LKML-Reference: <4C47DDCF.80300@jp.fujitsu.com> Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index acc15b23b74..03b48c80c65 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ram_page = 0, not_rampage = 0; unsigned long page_nr; -- cgit v1.2.3 From ca65f9fc0c447da5b270b05c41c21b19c88617c3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Jul 2010 14:37:48 +0100 Subject: Introduce CONFIG_XEN_PVHVM compile option This patch introduces a CONFIG_XEN_PVHVM compile time option to enable/disable Xen PV on HVM support. Signed-off-by: Stefano Stabellini --- arch/x86/kernel/cpu/hypervisor.c | 2 +- arch/x86/xen/Kconfig | 5 +++++ arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 2 ++ arch/x86/xen/platform-pci-unplug.c | 2 ++ arch/x86/xen/time.c | 3 ++- 6 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 5bccedcb912..8095f8611f8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif }; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index b83e119fbeb..68128a1b401 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,11 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. 
+config XEN_PVHVM + def_bool y + depends on XEN + depends on X86_LOCAL_APIC + config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" default 8 if X86_32 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75b479a684f..6f5345378ab 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1282,6 +1282,7 @@ void xen_hvm_init_shared_info(void) } } +#ifdef CONFIG_XEN_PVHVM static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1338,3 +1339,4 @@ const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { .init_platform = xen_hvm_guest_init, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); +#endif diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 84648c1bf13..413b19b3d0f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1942,6 +1942,7 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_exit_mmap(struct mm_struct *mm) { struct xen_hvm_pagetable_dying a; @@ -1973,6 +1974,7 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; } +#endif #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 2f7f3fb3477..554c002a1e1 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -32,6 +32,7 @@ /* store the value of xen_emul_unplug after the unplug is done */ int xen_platform_pci_unplug; EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +#ifdef CONFIG_XEN_PVHVM static int xen_emul_unplug; static int __init check_platform_magic(void) @@ -133,3 +134,4 @@ static int __init parse_xen_emul_unplug(char *arg) return 0; } early_param("xen_emul_unplug", parse_xen_emul_unplug); +#endif diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 4780e55886a..2aab4a2b910 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -516,6 +516,7 @@ __init void xen_init_time_ops(void) x86_platform.set_wallclock = xen_set_wallclock; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); @@ -544,4 +545,4 @@ __init void xen_hvm_init_time_ops(void) x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } - +#endif -- cgit v1.2.3 From 73cd3b43f08cc9a9bcb168994b8e9ebd983ff573 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 15 Jun 2010 15:43:19 +0200 Subject: x86/PCI: pci, fix section mismatch pcibios_scan_specific_bus calls pci_scan_bus_on_node which is __devinit. Mark pcibios_scan_specific_bus __devinit as well since all users are now __init or __devinit. Signed-off-by: Jiri Slaby Signed-off-by: Jesse Barnes --- arch/x86/pci/legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 8d460eaf524..c89266be604 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -36,7 +36,7 @@ int __init pci_legacy_init(void) return 0; } -void pcibios_scan_specific_bus(int busn) +void __devinit pcibios_scan_specific_bus(int busn) { int devfn; long node; -- cgit v1.2.3 From 7bd1c365fd124624191d49dcc1eb9759d6017ec3 Mon Sep 17 00:00:00 2001 From: Mike Habeck Date: Wed, 12 May 2010 11:14:32 -0700 Subject: x86/PCI: Add option to not assign BAR's if not already assigned The Linux kernel assigns BARs that a BIOS did not assign, most likely to handle broken BIOSes that didn't enumerate the devices correctly. 
On UV the BIOS purposely doesn't assign I/O BARs for certain devices/drivers we know don't use them (examples: LSI SAS, Qlogic FC, ...). We purposely don't assign these I/O BARs because I/O Space is a very limited resource. There is only 64k of I/O Space, and in a PCIe topology that space gets divided up into 4k chunks (this is due to the fact that a pci-to-pci bridge's I/O decoder is aligned at 4k)... Thus a system can have at most 16 cards with I/O BARs: (64k / 4k = 16) SGI needs to scale to >16 devices with I/O BARs. So by not assigning I/O BARs on devices we know don't use them, we can do that (iff the kernel doesn't go and assign these BARs that the BIOS purposely didn't assign). This patch will not assign a resource to a device BAR if that BAR was not assigned by the BIOS, and the kernel cmdline option 'pci=nobar' was specified. This patch is closely modeled after the 'pci=norom' option that currently exists in the tree. Signed-off-by: Mike Habeck Signed-off-by: Mike Travis Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index cd2a31dc5fb..49c7219826f 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -30,6 +30,7 @@ #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 +#define PCI_NOASSIGN_BARS 0x200000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 215a27ae050..a0772af64ef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void) static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) { struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + struct resource *bar_r; + int bar; + + if (pci_probe & PCI_NOASSIGN_BARS) { + /* + * If the BIOS did not assign the BAR, zero out the + * resource so the kernel doesn't attempt to assign + * it later on in pci_assign_unassigned_resources + */ + for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + bar_r = &dev->resource[bar]; + if (bar_r->start == 0 && bar_r->end != 0) { + bar_r->flags = 0; + bar_r->end = 0; + } + } + } if (pci_probe & PCI_NOASSIGN_ROMS) { if (rom_r->parent) @@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "norom")) { pci_probe |= PCI_NOASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "nobar")) { + pci_probe |= PCI_NOASSIGN_BARS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; -- cgit v1.2.3 From 2491762cfb475dbdfa3db11ebea6de49f58b7fac Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Jul 2010 12:53:27 -0600 Subject: x86/PCI: use host bridge _CRS info on ASRock ALiveSATA2-GLAN This DMI quirk turns on "pci=use_crs" for the ALiveSATA2-GLAN because amd_bus.c doesn't handle this system correctly. The system has a single HyperTransport I/O chain, but has two PCI host bridges to buses 00 and 80. amd_bus.c learns the MMIO range associated with buses 00-ff and that this range is routed to the HT chain hosted at node 0, link 0: bus: [00, ff] on node 0 link 0 bus: 00 index 1 [mem 0x80000000-0xfcffffffff] This includes the address space for both bus 00 and bus 80, and amd_bus.c assumes it's all routed to bus 00. 
We find device 80:01.0, which the BIOS left in the middle of that space, but we don't find a bridge from bus 00 to bus 80, so we conclude that 80:01.0 is unreachable from bus 00, and we move it from the original, working, address to something outside the bus 00 aperture, which does not work: pci 0000:80:01.0: reg 10: [mem 0xfebfc000-0xfebfffff 64bit] pci 0000:80:01.0: BAR 0: assigned [mem 0xfd00000000-0xfd00003fff 64bit] The BIOS told us everything we need to know to handle this correctly, so we're better off if we just pay attention, which lets us leave the 80:01.0 device at the original, working, address: ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-7f]) pci_root PNP0A03:00: host bridge window [mem 0x80000000-0xff37ffff] ACPI: PCI Root Bridge [PCI1] (domain 0000 [bus 80-ff]) pci_root PNP0A08:00: host bridge window [mem 0xfebfc000-0xfebfffff] This was a regression between 2.6.33 and 2.6.34. In 2.6.33, amd_bus.c was used only when we found multiple HT chains. 3e3da00c01d050, which enabled amd_bus.c even on systems with a single HT chain, caused this failure. This quirk was written by Graham. If we ever enable "pci=use_crs" for machines from 2006 or earlier, this quirk should be removed. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=16007 Cc: stable@kernel.org Reported-by: Graham Ramsey Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2ec04c424a6..15466c096ba 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASRock ALiveSATA2-GLAN", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), + }, + }, {} }; -- cgit v1.2.3 From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 23 Jul 2010 14:56:28 +0100 Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove unsafe and unnecessary hardware access" changed read_msi_msg_desc() to return the last MSI message written instead of reading it from the device, since it may be called while the device is in a reduced power state. However, the pSeries platform code really does need to read messages from the device, since they are initially written by firmware. 
Therefore: - Restore the previous behaviour of read_msi_msg_desc() - Add new functions get_cached_msi_msg{,_desc}() which return the last MSI message written - Use the new functions where appropriate Acked-by: Michael Ellerman Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26..4dc0084ec1b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); -- cgit v1.2.3 From 1f7979ac53224b0208e7d3eaeb5fd72ab9687389 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:04:03 +0400 Subject: x86/PCI: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9810a0f76c9..f547ee05f71 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); /* Update IRQ for all devices with the same pirq value */ - while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + for_each_pci_dev(dev2) { pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; @@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void) u8 pin; DBG(KERN_DEBUG "PCI: IRQ fixup\n"); - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* * If the BIOS has set an out of range IRQ number, just * ignore it. Also keep track of which IRQ's are @@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void) return; dev = NULL; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -- cgit v1.2.3 From 68f202e4e87cfab4439568bf397fcc5c7cf8d729 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Jul 2010 11:46:42 -0700 Subject: x86, mtrr: Use stop machine context to rendezvous all the cpu's Use the stop machine context rather than IPI's to rendezvous all the cpus for MTRR initialization that happens during cpu bringup or for MTRR modifications during runtime. This avoids a deadlock scenario (reported by Prarit) like: cpu A holds a read_lock (tasklist_lock for example) with irqs enabled cpu B waits for the same lock with irqs disabled using write_lock_irq cpu C doing set_mtrr() (during AP bringup for example), which will try to rendezvous all the cpus using IPI's This will result in C and A coming to the rendezvous point and waiting for B. B is stuck forever waiting for the lock and thus not reaching the rendezvous point. Using stop cpu (run in the process context of per cpu based keventd) to do this rendezvous avoids this deadlock scenario. Also make sure all the cpu's are in the rendezvous handler before we proceed with the local_irq_save() on each cpu. 
This lock step disabling irqs on all the cpus will avoid other deadlock scenarios (for example, those involving the blocking smp_call_function's, etc.). [ This problem is very old. Marking -stable only for 2.6.35 as the stop_one_cpu_nowait() API is present only in 2.6.35. Any older kernel interested in this fix needs to do some more work in backporting this patch. ] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha LKML-Reference: <1280515602.2682.10.camel@sbsiddha-MOBL3.sc.intel.com> Acked-by: Prarit Bhargava Cc: stable@kernel.org [2.6.35] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 56 +++++++++++++++++++++++++++++++---------- arch/x86/kernel/smpboot.c | 7 ++++++ 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b60..01c0f3ee6cc 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include /* FIXME: kvm_para.h needs this */ +#include #include #include #include @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. 
* When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d..11015fd1abb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) -- cgit v1.2.3 From 9792db6174d9927700ed288e6d74b9391bf785d1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:42 -0700 Subject: x86, cpu: Package Level Thermal Control, Power Limit Notification definitions Add package level thermal and power limit feature support. The two MSRs and features are new starting with Intel's Sandy Bridge processor. Please check Intel 64 and IA-32 Architectures SDM Vol 3A 14.5.6 Power Limit Notification and 14.6 Package Level Thermal Management. This patch also fixes a bug that defined the THERM_INT_LOW_ENABLE and THERM_INT_HIGH_ENABLE bits in reverse. [ hpa: fixed up against current tip:x86/cpu ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-2-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/msr-index.h | 17 +++++++++++++++-- arch/x86/kernel/cpu/scattered.c | 2 ++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4be50ddd4d7..817aa316b18 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -166,6 +166,8 @@ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ +#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7cc4a026331..4ea2a7ca7a4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -224,12 +224,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b -#define THERM_INT_LOW_ENABLE (1 << 0) -#define THERM_INT_HIGH_ENABLE (1 << 1) +#define THERM_INT_HIGH_ENABLE (1 << 0) +#define THERM_INT_LOW_ENABLE (1 << 1) +#define THERM_INT_PLN_ENABLE (1 << 24) #define MSR_IA32_THERM_STATUS 0x0000019c #define THERM_STATUS_PROCHOT (1 << 0) +#define THERM_STATUS_POWER_LIMIT (1 << 10) #define MSR_THERM2_CTL 0x0000019d @@ -241,6 +243,17 @@ #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 + +#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 + +#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) +#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 9815364b477..34b4dad6f0b 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -33,6 +33,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, -- cgit v1.2.3 From 25971865d48a8d0ece5307a59dbd3f06d05a7567 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 16 Jun 2010 23:19:28 -0400 Subject: x86, olpc: Use pr_debug() for EC commands Unconditionally printing EC debug messages was helpful when we were actually debugging the EC, but during normal operation it can get pretty annoying. Using pr_debug allows us finer-grained control. Signed-off-by: Andres Salomon LKML-Reference: <20100616231928.16b539f0@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/olpc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 156605281f5..f5ff3903b38 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -142,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -159,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -173,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } -- cgit v1.2.3 From 54e5bc020ce1c959eaa7be18cedb734b6b13745e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 28 Jun 2010 22:00:29 -0400 Subject: x86, olpc: Constify an olpc_ofw() arg The arguments passed to OFW shouldn't be modified; update the 'args' argument of olpc_ofw to reflect this. This saves us some later casting away of consts. Signed-off-by: Andres Salomon LKML-Reference: <20100628220029.1555ac24@debian> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc_ofw.h | 2 +- arch/x86/kernel/olpc.c | 2 +- arch/x86/kernel/olpc_ofw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 3e63d857c48..08fde475cb3 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -12,7 +12,7 @@ #define olpc_ofw(name, args, res) \ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) -extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res); /* determine whether OFW is available and lives in the proper memory */ diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index f5ff3903b38..0e0cdde519b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -188,7 +188,7 @@ static void __init platform_detect(void) { size_t propsize; __be32 rev; - void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; void *res[] = { &propsize }; if (olpc_ofw("getprop", args, res) || propsize != 4) { diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index f5d499fbe74..3218aa71ab5 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -40,7 +40,7 @@ void __init setup_olpc_ofw_pgd(void) early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res) { int ofw_args[MAXARGS + 3]; -- cgit v1.2.3 From c4026cfd8febcd63dd278894108839f30e525a0e Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 30 Jul 2010 14:10:55 -0500 Subject: x86, UV: Initialize BAU hub map Fix uninitialized uvhub_mask: - An uninitialized bit map variable was causing initialization of non-existent hubs (this one causes boot panics). - And the bit map was too small for large machines. This patch makes it dynamic in size. 
- Fix the case where socket 0 has no enabled cpu's. Don't assume every hub has a socket 0. - uv_init_per_cpu() should be __init. Signed-off-by: Cliff Wickman Cc: # for .35.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 59efb5390b3..312ef029281 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1484,15 +1484,16 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void uv_init_per_cpu(int nuvhubs) +static void __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; int pnode; int uvhub; + int have_hmaster; short socket = 0; unsigned short socket_mask; - unsigned int uvhub_mask; + unsigned char *uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1516,28 +1517,29 @@ static void uv_init_per_cpu(int nuvhubs) uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - uvhub_mask |= (1 << uvhub); + *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1);; + socket = (cpu_to_node(cpu) & 1); bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - uvhub = 0; - while (uvhub_mask) { - if (!(uvhub_mask & 1)) - goto nexthub; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) + continue; + have_hmaster = 0; bdp = &uvhub_descs[uvhub]; socket_mask = bdp->socket_mask; socket = 0; @@ -1551,8 +1553,10 @@ static void uv_init_per_cpu(int nuvhubs) bcp->cpu = cpu; if (i == 0) { smaster = bcp; - if (socket == 0) + if (!have_hmaster) { + have_hmaster++; hmaster = bcp; + } } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; @@ -1566,11 +1570,9 @@ nextsocket: socket++; socket_mask = (socket_mask >> 1); } -nexthub: - uvhub++; - uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); + kfree(uvhub_mask); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; -- cgit v1.2.3 From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 14:28:19 +0800 Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages In common cases, a guest SRAO MCE will cause the corresponding poisoned page to be unmapped and a SIGBUS to be sent to QEMU-KVM, and QEMU-KVM will then relay the MCE to the guest OS. But it is reported that if the poisoned page is accessed in the guest after unmapping and before the MCE is relayed to the guest OS, userspace will be killed. The reason is as follows. Because the poisoned page has been unmapped, a guest access will cause a guest exit and kvm_mmu_page_fault will be called. kvm_mmu_page_fault cannot get the poisoned page for the fault address, so kernel and user space MMIO processing is tried in turn. 
In user MMIO processing, the poisoned page is accessed again, and userspace is killed by force_sig_info. To fix the bug, kvm_mmu_page_fault sends a HWPOISON signal to QEMU-KVM and does not try kernel and user space MMIO processing for the poisoned page. [xiao: fix warning introduced by avi] Reported-by: Max Asbock Signed-off-by: Huang Ying Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a591..b666d8d106a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ + char buf[1]; + void __user *hva; + int r; + + /* Touch the page, so send SIGBUS */ + hva = (void __user *)gfn_to_hva(kvm, gfn); + r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(kvm, gfn); + return 0; + } + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b54..c7f27779c99 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -431,11 +431,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) { - pgprintk("gfn %lx is mmio\n", walker.gfn); - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) -- cgit v1.2.3 From c332c83ae736c72dcf072e96e98a774fce39e722 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 12:24:12 +0300 Subject: KVM: VMX: Simplify vmx_get_nmi_mask() !! is not needed due to the cast to bool. 
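As a minimal illustration (not from the patch): conversion to C99 _Bool already normalizes any non-zero value to 1, which is why the '!!' idiom is redundant when the expression is returned as bool.

#include <assert.h>
#include <stdbool.h>

int main(void)
{
	unsigned int info = 0x8;      /* e.g. an interruptibility-state bit */
	bool masked = info & 0x8;     /* implicit conversion to _Bool */

	assert(masked == 1);          /* same result as !!(info & 0x8) */
	return 0;
}
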
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe7..64252075796 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,9 +2826,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; - else - return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - GUEST_INTR_STATE_NMI); + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -- cgit v1.2.3 From 914ebccd2d8fa439e01fe93b5229534b9e179a69 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 28 Apr 2010 18:50:36 +0900 Subject: KVM: x86: avoid unnecessary bitmap allocation when memslot is clean Although we always allocate a new dirty bitmap in x86's get_dirty_log(), it is only used as a zero-source for copy_to_user() and freed right after that when the memslot is clean. This patch uses clear_user() instead of doing this unnecessary zero-source allocation. Performance improvement: as one would expect, the time spent allocating the bitmap is eliminated entirely. In my test, the improved ioctl was about 4 to 10 times faster than the original one for clean slots. Furthermore, reducing memory allocations and copies also benefits the caches. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64..1b270fd6063 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2797,7 +2797,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -2812,27 +2811,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; - memset(dirty_bitmap, 0, n); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables.
*/ if (is_dirty) { struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) { + vfree(dirty_bitmap); + goto out; + } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +2843,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { + vfree(dirty_bitmap); + goto out; + } + vfree(dirty_bitmap); + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; } r = 0; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - r = -EFAULT; -out_free: - vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; -- cgit v1.2.3 From 08acfa187117046f8b5044b4a4cdc910f3ceeeb5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 13:00:55 +0300 Subject: KVM: kvm_pdptr_read() may sleep Annotate it thusly. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf532..d2a98f8f9af 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { + might_sleep(); /* on svm */ + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); -- cgit v1.2.3 From 1c11e713576edf33b95669be9c2dc0ff1e0c90d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:05:44 +0300 Subject: KVM: VMX: Avoid writing HOST_CR0 every entry cr0.ts may change between entries, so we copy cr0 to HOST_CR0 before each entry. That is slow, so instead, set HOST_CR0 to have TS set unconditionally (which is a safe value), and issue a clts() just before exiting vcpu context if the task indeed owns the fpu. Saves ~50 cycles/exit. 
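The resulting shape of the fast path, as a sketch (TS_USEDFPU and clts() are the interfaces this kernel generation provides; the helper itself is illustrative, the real work happens inline in __vmx_load_host_state() in the patch below):

	/*
	 * Keep HOST_CR0.TS permanently set in the VMCS and fix up the
	 * live CR0 once, on the way out of vcpu context, instead of
	 * rewriting HOST_CR0 on every guest entry.
	 */
	static inline void fixup_host_cr0_ts(void)
	{
		/* If this task owns the FPU, the host expects TS clear. */
		if (current_thread_info()->status & TS_USEDFPU)
			clts();
		/* Otherwise TS=1 is what the host wants: the next FPU use
		 * traps and restores state lazily, as usual. */
	}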
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++------ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64252075796..59893173425 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -812,6 +812,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); } #endif + if (current_thread_info()->status & TS_USEDFPU) + clts(); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2507,7 +2509,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ @@ -3859,11 +3861,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b270fd6063..801afc6461e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1731,8 +1731,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_put_guest_fpu(vcpu); kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); } static int is_efer_nx(void) -- cgit v1.2.3 From 9de41573675cbace09b02ef386f3e9c8739d495c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:22 +0300 Subject: KVM: x86 emulator: introduce read cache Introduce a read cache, which is needed for instructions that require more than one exit to userspace. After returning from userspace the instruction will be re-executed with the cached read value. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0b2729bf207..288cbedcab1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -186,6 +186,7 @@ struct decode_cache { unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; + struct read_cache mem_read; }; struct x86_emulate_ctxt { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed..776874b8e50 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1263,6 +1263,33 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ?
-1 : 0; } +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) +{ + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; + + rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; + + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; + } + return X86EMUL_CONTINUE; +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -1504,9 +1531,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -2475,6 +2502,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int saved_dst_type = c->dst.type; ctxt->interruptibility = 0; + ctxt->decode.mem_read.pos = 0; /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't @@ -2529,20 +2557,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + &c->src.val, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } if (c->src2.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src2.ptr, - &c->src2.val, - c->src2.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2553,8 +2577,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, - c->dst.bytes, ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2981,7 +3005,11 @@ writeback: (rc->end != 0 && rc->end == rc->pos)) ctxt->restart = false; } - + /* + * reset read cache here in case string instruction is restared + * without decoding + */ + ctxt->decode.mem_read.end = 0; /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(ctxt->vcpu, c->eip); -- cgit v1.2.3 From 054fe9f6e3b76877516b37ac7d83d58c7f37c1b6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:23 +0300 Subject: KVM: x86 emulator: fix Move r/m16 to segment register decoding This instruction does not need generic decoding for its dst operand. 
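To see why, contrast what generic DstReg decoding would do with what the opcode handler actually needs (illustrative, not literal kernel code):

	/*
	 * DstReg decoding would bind the ModRM reg field to a general
	 * purpose register and write it back afterwards:
	 *
	 *	c->dst.ptr = &c->regs[c->modrm_reg];	/* wrong table */
	 *
	 * but for mov sreg, r/m16 that field names a segment register,
	 * which the handler loads itself, roughly:
	 *
	 *	load_segment_descriptor(ctxt, ops, c->src.val, c->modrm_reg);
	 *
	 * so the destination must stay out of the generic decode and
	 * writeback paths -- hence ImplicitOps.
	 */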
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 776874b8e50..a81e6bfcade 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -171,7 +171,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, + ImplicitOps | SrcMem | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit v1.2.3 From f0c13ef1a8f31be08bf1b1244fe4565f11f4b009 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:24 +0300 Subject: KVM: x86 emulator: cleanup xchg emulation Dst operand is already initialized during decoding stage. No need to reinitialize. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a81e6bfcade..a99d49cc893 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2804,8 +2804,8 @@ special_insn: break; } case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.type = OP_REG; + c->src.bytes = c->op_bytes; c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; c->src.val = *(c->src.ptr); goto xchg; -- cgit v1.2.3 From b8a98945ea5b735e083eaf92906aa0ff9ece92e8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:25 +0300 Subject: KVM: x86 emulator: cleanup nop emulation Make it more explicit what we are checking for. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a99d49cc893..03a72912d7b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2799,8 +2799,8 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; + if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 0x97: /* xchg reg,rax */ -- cgit v1.2.3 From 414e6277fd148f6470261cef50a7fed0d88a2825 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:26 +0300 Subject: KVM: x86 emulator: handle "far address" source operand ljmp/lcall instruction operand contains address and segment. It can be 10 bytes long. Currently we decode it as two different operands. Fix it by introducing new kind of operand that can hold entire far address. 
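The in-memory layout the new operand kind has to carry, for the 32-bit operand-size case (illustrative struct; the emulator reads the bytes with memcpy() rather than through a type):

	/* a ptr16:32 far pointer as used by ljmp/lcall: the offset comes
	 * first, the 16-bit selector follows, which is why the operand
	 * needs op_bytes + 2 bytes of storage */
	struct far_ptr32 {
		unsigned int   offset;		/* new eip, op_bytes wide */
		unsigned short selector;	/* new cs */
	} __attribute__((packed));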
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 +++- arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 288cbedcab1..69a64a6a36f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -143,7 +143,11 @@ struct x86_emulate_ops { struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - unsigned long val, orig_val, *ptr; + unsigned long orig_val, *ptr; + union { + unsigned long val; + char valptr[sizeof(unsigned long) + 2]; + }; }; struct fetch_cache { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 03a72912d7b..687ea0906b7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -67,6 +67,8 @@ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -88,10 +90,6 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be - in memory and second argument is located - immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { @@ -175,7 +173,7 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16 | No64, 0, + 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -215,7 +213,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, + SrcImmFAddr | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ @@ -350,7 +348,7 @@ static u32 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, @@ -576,6 +574,13 @@ static u32 group2_table[] = { (_type)_x; \ }) +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -1160,6 +1165,17 @@ done_prefixes: c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; } /* @@ -1179,22 +1195,10 @@ done_prefixes: c->src2.bytes = 1; c->src2.val 
= insn_fetch(u8, 1, c->eip); break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; break; - case Src2Mem16: - c->src2.type = OP_MEM; - c->src2.bytes = 2; - c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); - c->src2.val = 0; - break; } /* Decode and fetch the destination operand: register or memory. */ @@ -2558,7 +2562,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->src.type == OP_MEM) { rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, - &c->src.val, c->src.bytes); + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; @@ -2884,14 +2888,18 @@ special_insn: } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ + case 0xea: { /* jmp far */ + unsigned short sel; jump_far: - if (load_segment_descriptor(ctxt, ops, c->src2.val, - VCPU_SREG_CS)) + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) goto done; - c->eip = c->src.val; + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); break; + } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); -- cgit v1.2.3 From 35aa5375d407ecadcc3adb5cb31d27044bf7f29f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:27 +0300 Subject: KVM: x86 emulator: add (set|get)_dr callbacks to x86_emulate_ops Add (set|get)_dr callbacks to x86_emulate_ops instead of calling them directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/emulate.c | 7 +++++-- arch/x86/kvm/x86.c | 12 ++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 69a64a6a36f..c37296d0e90 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -137,6 +137,8 @@ struct x86_emulate_ops { void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); + int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); }; /* Type, address-of, and value of an instruction's operand. 
*/ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76f5483cffe..97774ae3c87 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -591,10 +591,6 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 687ea0906b7..8a4aa73ff1e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3132,7 +3132,7 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ @@ -3145,7 +3145,10 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); + + ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), + ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 801afc6461e..059d63de169 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3620,16 +3620,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(ctxt->vcpu, dr, dest); + return kvm_get_dr(vcpu, dr, dest); } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - - return kvm_set_dr(ctxt->vcpu, dr, value & mask); + return kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3811,6 +3809,8 @@ static struct x86_emulate_ops emulate_ops = { .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .set_rflags = emulator_set_rflags, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3fb1b5dbd397d16a855c97c3fb80fe6e9196ce7c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:28 +0300 Subject: KVM: x86 emulator: add (set|get)_msr callbacks to x86_emulate_ops Add (set|get)_msr callbacks to x86_emulate_ops instead of calling them directly. 
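One thing the indirection buys, sketched with an invented stub (fake_get_msr() is hypothetical, purely for illustration): the emulator core can now be driven by any x86_emulate_ops implementation, with no hard dependency on kvm_x86_ops.

	/* hypothetical test stub: every MSR reads as zero */
	static int fake_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
	{
		*pdata = 0;
		return 0;
	}

	/* wired up the same way as the real implementation:
	 *	static struct x86_emulate_ops test_ops = {
	 *		.get_msr = fake_get_msr,
	 *		...
	 *	};
	 */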
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 36 ++++++++++++++++++------------------ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c37296d0e90..f751657be73 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,8 @@ struct x86_emulate_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); }; /* Type, address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a4aa73ff1e..7c8ed560fd4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1875,7 +1875,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, } static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1890,7 +1890,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; cs.selector = (u16)(msr_data & 0xfffc); ss.selector = (u16)(msr_data + 8); @@ -1907,17 +1907,17 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); + ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? 
+ MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1927,7 +1927,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) } static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1949,7 +1949,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { @@ -1979,17 +1979,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -2012,7 +2012,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.dpl = 3; ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs.selector = (u16)(msr_data + 16); @@ -3099,7 +3099,7 @@ twobyte_insn: } break; case 0x05: /* syscall */ - rc = emulate_syscall(ctxt); + rc = emulate_syscall(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else @@ -3155,7 +3155,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } @@ -3164,7 +3164,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } else { @@ -3175,14 +3175,14 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt); + rc = emulate_sysenter(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else goto writeback; break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt); + rc = emulate_sysexit(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 059d63de169..e3a5455049b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3811,6 +3811,8 @@ static struct x86_emulate_ops emulate_ops = { .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, }; static void 
cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 5951c4423724759906b10a26aa6a8817c4afa615 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:29 +0300 Subject: KVM: x86 emulator: add get_cached_segment_base() callback to x86_emulate_ops On VMX it is expensive to call get_cached_descriptor() just to get the segment base, since multiple vmcs_reads are done instead of only one. Introduce a new callback, get_cached_segment_base(), for efficiency. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 13 +------------ arch/x86/kvm/x86.c | 7 +++++++ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index f751657be73..df53ba2294b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -132,6 +132,7 @@ struct x86_emulate_ops { int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); + unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7c8ed560fd4..8228778ace3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2097,17 +2097,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } -static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - int seg) -{ - struct desc_struct desc; - if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) - return get_desc_base(&desc); - else - return ~0; -} - static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct tss_segment_16 *tss) @@ -2383,7 +2372,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, int ret; u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); ulong old_tss_base = - get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); u32 desc_limit; /* FIXME: old_tss_base == ~0 ?
*/ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3a5455049b..9a469df6011 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3714,6 +3714,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, struct kvm_vcpu *vcpu) { @@ -3804,6 +3810,7 @@ static struct x86_emulate_ops emulate_ops = { .set_cached_descriptor = emulator_set_cached_descriptor, .get_segment_selector = emulator_get_segment_selector, .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, -- cgit v1.2.3 From 79168fd1a307ffee46ee03b7f8711559241738c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:30 +0300 Subject: KVM: x86 emulator: cleanup some direct calls into kvm to use existing callbacks Use callbacks from x86_emulate_ops to access segments instead of calling into kvm directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 200 ++++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8228778ace3..f56ec486393 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -622,31 +622,35 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); + return ops->get_cached_segment_base(seg, ctxt->vcpu); } static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) return 0; - return seg_base(ctxt, c->seg_override); + return seg_base(ctxt, ops, c->seg_override); } -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +static unsigned long es_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_ES); + return seg_base(ctxt, ops, VCPU_SREG_ES); } -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_SS); + return seg_base(ctxt, ops, VCPU_SREG_SS); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -941,7 +945,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { @@ -1065,7 +1069,7 @@ done_prefixes: set_seg_override(c, VCPU_SREG_DS); if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); + c->modrm_ea += seg_override_base(ctxt, ops, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1161,7 +1165,7 @@ done_prefixes: c->src.type = OP_MEM; c->src.bytes = 
(c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, c), + register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; @@ -1257,7 +1261,7 @@ done_prefixes: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt), + register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; @@ -1516,7 +1520,8 @@ exception: return X86EMUL_PROPAGATE_FAULT; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; @@ -1524,7 +1529,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]); } @@ -1535,7 +1540,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]), dest, len); if (rc != X86EMUL_CONTINUE) @@ -1588,15 +1593,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment segment; - kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - c->src.val = segment.selector; - emulate_push(ctxt); + emulate_push(ctxt, ops); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1614,7 +1618,8 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +static void emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1624,7 +1629,7 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt) (reg == VCPU_REGS_RSP) ? 
(c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt); + emulate_push(ctxt, ops); ++reg; } } @@ -1726,14 +1731,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } return X86EMUL_CONTINUE; @@ -1847,39 +1852,40 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) + struct x86_emulate_ops *ops, struct desc_struct *cs, + struct desc_struct *ss) { - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); + memset(cs, 0, sizeof(struct desc_struct)); + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ + set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ + set_desc_limit(cs, 0xfffff); /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; + cs->p = 1; + cs->d = 1; - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ + set_desc_base(ss, 0); /* flat segment */ + set_desc_limit(ss, 0xfffff); /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ + ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; - ss->present = 1; + ss->p = 1; } static int emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -1888,19 +1894,21 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); + cs_sel = (u16)(msr_data & 0xfffc); + ss_sel = (u16)(msr_data + 8); if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { @@ -1930,8 +1938,9 @@ static int emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { @@ -1947,7 +1956,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return 
X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { @@ -1966,18 +1975,20 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data; + cs_sel &= ~SELECTOR_RPL_MASK; + ss_sel = cs_sel + 8; + ss_sel &= ~SELECTOR_RPL_MASK; if (ctxt->mode == X86EMUL_MODE_PROT64 || is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; @@ -1992,9 +2003,10 @@ static int emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; int usermode; + u16 cs_sel, ss_sel; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -2003,7 +2015,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -2015,29 +2027,31 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); + cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = (u16)(msr_data + 24); + ss_sel = (u16)(msr_data + 24); break; case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); + cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = cs.selector + 8; - cs.db = 0; + ss_sel = cs_sel + 8; + cs.d = 0; cs.l = 1; break; } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; + cs_sel |= SELECTOR_RPL_MASK; + ss_sel |= SELECTOR_RPL_MASK; - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; @@ -2061,25 +2075,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - struct kvm_segment tr_seg; + struct desc_struct tr_seg; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); - if (tr_seg.unusable) + ops->get_cached_descriptor(&tr_seg, 
VCPU_SREG_TR, ctxt->vcpu); + if (!tr_seg.p) return false; - if (tr_seg.limit < 103) + if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, - NULL); + r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, + ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; - if (io_bitmap_ptr + port/8 > tr_seg.limit) + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, - ctxt->vcpu, NULL); + r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, + &perm, 1, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2445,7 +2459,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt); + emulate_push(ctxt, ops); } return ret; @@ -2588,7 +2602,7 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, VCPU_SREG_ES); + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -2600,14 +2614,14 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, VCPU_SREG_CS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, VCPU_SREG_SS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -2619,7 +2633,7 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, VCPU_SREG_DS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -2649,7 +2663,7 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: @@ -2658,7 +2672,7 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt); + emulate_pusha(ctxt, ops); break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); @@ -2672,7 +2686,7 @@ special_insn: break; case 0x68: /* push imm */ case 0x6a: /* push imm8 */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ @@ -2752,18 +2766,13 @@ special_insn: break; case 0x88 ... 
0x8b: /* mov */ goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= VCPU_SREG_GS) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { + case 0x8c: /* mov r/m, sreg */ + if (c->modrm_reg > VCPU_SREG_GS) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - c->dst.val = segreg.selector; + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; - } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; @@ -2804,7 +2813,7 @@ special_insn: goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -2872,7 +2881,7 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 0xe9: /* jmp rel */ @@ -2985,11 +2994,12 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, - &c->src); + string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), + VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) - string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, + &c->dst); if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; @@ -3188,7 +3198,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, VCPU_SREG_FS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3207,7 +3217,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, VCPU_SREG_GS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); -- cgit v1.2.3 From 0f12244fe70e8a94a491f6cd7ed70a352ab6c26c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:31 +0300 Subject: KVM: x86 emulator: make set_cr() callback return error if it fails Make set_cr() callback return error if it fails instead of injecting #GP behind emulator's back. 
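The change follows one pattern throughout, shown here reduced to its shape (crX stands in for cr0/cr3/cr4/cr8):

	/* a __helper that only reports failure ... */
	static int __kvm_set_crX(struct kvm_vcpu *vcpu, unsigned long val);

	/* ... and a public wrapper that preserves the old inject-#GP
	 * behaviour for callers outside the emulator */
	void kvm_set_crX(struct kvm_vcpu *vcpu, unsigned long val)
	{
		if (__kvm_set_crX(vcpu, val))
			kvm_inject_gp(vcpu, 0);
	}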
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 10 ++- arch/x86/kvm/x86.c | 148 +++++++++++++++++++------------------ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df53ba2294b..6c4f4918db5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -135,7 +135,7 @@ struct x86_emulate_ops { unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); + int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f56ec486393..061f7d37c9f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,7 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - ops->set_cr(3, tss->cr3, ctxt->vcpu); + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; c->regs[VCPU_REGS_RAX] = tss->eax; @@ -3135,7 +3138,10 @@ twobyte_insn: c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); + if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a469df6011..64c6e7a3141 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -414,57 +414,49 @@ out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & 0xffffffff00000000UL) + return 1; #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; } kvm_x86_ops->set_cr0(vcpu, cr0); kvm_mmu_reset_context(vcpu); - return; + return 0; +} + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (__kvm_set_cr0(vcpu, cr0)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr0); @@ -474,61 +466,56 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } 
EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - if (cr4 & CR4_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr4 & CR4_RESERVED_BITS) + return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; kvm_mmu_reset_context(vcpu); + + return 0; +} + +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (__kvm_set_cr4(vcpu, cr4)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -546,24 +533,34 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. 
*/ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; +} + +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (__kvm_set_cr3(vcpu, cr3)) kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (__kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -3681,27 +3678,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = __kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - kvm_set_cr8(vcpu, val & 0xfUL); + res = __kvm_set_cr8(vcpu, val & 0xfUL); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; } + + return res; } static int emulator_get_cpl(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 338dbc9781eb5acd0b12809d95d4006135f29767 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:32 +0300 Subject: KVM: x86 emulator: make (get|set)_dr() callback return error if it fails Make (get|set)_dr() callback return error if it fails instead of injecting exception behind emulator's back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++--- arch/x86/kvm/x86.c | 63 +++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 061f7d37c9f..d5979ecc252 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3151,9 +3151,14 @@ twobyte_insn: goto done; } - ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), - ctxt->vcpu); + if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? + ~0ULL : ~0U), ctxt->vcpu) < 0) { + /* #UD condition is already handled by the code above */ + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c6e7a3141..44a546b136f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -573,7 +573,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { case 0 ... 
3: @@ -582,29 +582,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -615,28 +607,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; @@ -645,6 +646,15 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 0; } + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; +} EXPORT_SYMBOL_GPL(kvm_get_dr); static inline u32 bit(int bitno) @@ -3619,12 +3629,13 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(vcpu, dr, dest); } int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - return kvm_set_dr(vcpu, dr, value); + + return __kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -- cgit v1.2.3 From e680080e653b8c8725ca620bf22a5f8480f40cb5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:33 +0300 Subject: KVM: x86 emulator: fix X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED values Currently X86EMUL_PROPAGATE_FAULT, X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED have the same value so caller cannot distinguish why function such as emulator_cmpxchg_emulated() (which can return both X86EMUL_PROPAGATE_FAULT and X86EMUL_CMPXCHG_FAILED) failed. 
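To make the clash concrete, here is a minimal hedged sketch of a caller; the surrounding names are invented for illustration, and the pre-patch values are the ones visible in the hunk below:

        /* pre-patch: both macros expanded to 2, aliasing X86EMUL_PROPAGATE_FAULT */
        rc = emulator_cmpxchg_emulated(addr, &old, &new, bytes, vcpu);
        if (rc == X86EMUL_CMPXCHG_FAILED)       /* also true for PROPAGATE_FAULT! */
                goto retry;                     /* may wrongly retry after a fault */
        if (rc == X86EMUL_PROPAGATE_FAULT)      /* dead code: matched above */
                goto inject;

With distinct values (3 and 4), the two branches become reachable and actually mean what they say.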
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6c4f4918db5..0cf4311db0d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -51,8 +51,9 @@ struct x86_emulate_ctxt; #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ + struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. -- cgit v1.2.3 From 411c35b7ef02aefb91e166ffeffad0891d955fcb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:34 +0300 Subject: KVM: fill in run->mmio details in (read|write)_emulated function Fill in run->mmio details in (read|write)_emulated function just like pio does. There is no point in filling only vcpu fields there just to copy them into vcpu->run a little bit later. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 44a546b136f..b976c4c1fa8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3386,9 +3386,10 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; return X86EMUL_UNHANDLEABLE; } @@ -3436,10 +3437,11 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); return X86EMUL_CONTINUE; } @@ -3850,7 +3852,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, { int r, shadow_mask; struct decode_cache *c; - struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3937,14 +3938,6 @@ restart: return EMULATE_DO_MMIO; } - if (r || vcpu->mmio_is_write) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - if (r) { if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) goto done; -- cgit v1.2.3 From c3cd7ffaf57ae6ead5b394cebaeb76164059a57f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:35 +0300 Subject: KVM: x86 emulator: x86_emulate_insn() return -1 only in case of emulation failure Currently emulator returns -1 when emulation failed or IO is needed. 
The caller tries to guess whether emulation failed by looking at other variables. Make it easier for the caller to recognise the error condition by always returning -1 in case of failure. For this, a new emulator-internal return value X86EMUL_IO_NEEDED is introduced. It is used to distinguish between an error condition (which returns X86EMUL_UNHANDLEABLE) and a condition that requires an IO exit to userspace to continue emulation. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 36 ++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0cf4311db0d..777240d4524 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -53,6 +53,7 @@ struct x86_emulate_ctxt; #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ +#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ struct x86_emulate_ops { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b976c4c1fa8..4cb65d82abc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3275,7 +3275,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3331,7 +3331,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3391,7 +3391,7 @@ mmio: vcpu->run->mmio.len = vcpu->mmio_size = bytes; vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - return X86EMUL_UNHANDLEABLE; + return X86EMUL_IO_NEEDED; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3863,8 +3863,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); - vcpu->mmio_is_write = 0; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -3938,24 +3936,26 @@ restart: return EMULATE_DO_MMIO; } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - goto done; - if (!vcpu->mmio_needed) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; + if (r) { /* emulation failed */ + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-enter the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + + trace_kvm_emulate_insn_failed(vcpu); + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; } -done: if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit v1.2.3 From f181b96d4c769b8915849eb9070c18116fd8d44e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:36 +0300 Subject: KVM: remove export of emulator_write_emulated() It is not called directly outside of the file it's defined in anymore.
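Stepping back to the X86EMUL_IO_NEEDED split introduced just above, the caller-side dispatch it enables can be sketched roughly as follows (illustrative only; in the real code the IO case is surfaced through vcpu->mmio_needed and vcpu->arch.pio rather than the raw return value):

        switch (rc) {
        case X86EMUL_CONTINUE:
                break;                          /* emulation completed */
        case X86EMUL_IO_NEEDED:
                return EMULATE_DO_MMIO;         /* exit to userspace, resume later */
        default:                                /* X86EMUL_UNHANDLEABLE */
                return EMULATE_FAIL;            /* genuine emulation failure */
        }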
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 ----- arch/x86/kvm/x86.c | 1 - 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97774ae3c87..2ca1867ed97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -628,11 +628,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cb65d82abc..15a4b754a45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3465,7 +3465,6 @@ int emulator_write_emulated(unsigned long addr, } return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); #define CMPXCHG_TYPE(t, ptr, old, new) \ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) -- cgit v1.2.3 From 8fe681e984b6505d4d12125c0776399304803ec7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:37 +0300 Subject: KVM: do not inject #PF in (read|write)_emulated() callbacks Return an error to the x86 emulator instead of injecting an exception behind its back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 +++ arch/x86/kvm/emulate.c | 12 +++++++++++- arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 777240d4524..b7e00cb21c6 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -94,6 +94,7 @@ struct x86_emulate_ops { int (*read_emulated)(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -106,6 +107,7 @@ struct x86_emulate_ops { int (*write_emulated)(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -120,6 +122,7 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); int (*pio_in_emulated)(int size, unsigned short port, void *val, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d5979ecc252..d7a18a0f80a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1277,6 +1277,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, { int rc; struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; while (size) { int n = min(size, 8u); @@ -1284,7 +1285,10 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1789,6 +1793,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, { int rc; struct decode_cache *c = &ctxt->decode; + u32 err; switch (c->dst.type) { case OP_REG: @@ -1817,13 +1822,18 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.orig_val, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); else rc =
ops->write_emulated( (unsigned long)c->dst.ptr, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, + (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15a4b754a45..51402d8a46f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3346,10 +3346,10 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3359,12 +3359,10 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3409,17 +3407,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3449,6 +3445,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ @@ -3456,14 +3453,16 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, error_code, + vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, error_code, + vcpu); } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -3480,6 +3479,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3533,7 +3533,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, vcpu); + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -4293,7 +4293,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, vcpu); + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -- cgit v1.2.3 From 3457e4192e367fd4e0da5e9f46f9df85fa99cd11 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:38 +0300 Subject: KVM: handle emulation failure case first If emulation failed return immediately. 
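The reshuffle applies the usual guard-clause pattern: handle the failure case first and return, so the success path below it stays flat. Schematically (illustrative C, not the kernel code):

        r = do_work();
        if (r) {                        /* failure handled first */
                try_to_recover();
                return EMULATE_FAIL;
        }
        /* success path: no failure logic interleaved below this point */
        commit_results();
        return EMULATE_DONE;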
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51402d8a46f..9e5a833f339 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3924,22 +3924,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - - if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } if (r) { /* emulation failed */ /* @@ -3955,6 +3939,21 @@ restart: return EMULATE_FAIL; } + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; + return EMULATE_DO_MMIO; + } + + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit v1.2.3 From 95c5588652f7742a21c33d9dcce0e043e057d04f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:39 +0300 Subject: KVM: x86 emulator: advance RIP outside x86 emulator code Return new RIP as part of instruction emulation result instead of updating KVM's RIP from x86 emulator code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++++--- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7a18a0f80a..437f31bcffe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2496,8 +2496,9 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, if (rc == X86EMUL_CONTINUE) { memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); rc = writeback(ctxt, ops); + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; } return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -2554,7 +2555,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { string_done: ctxt->restart = false; - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; goto done; } /* The second termination condition only applies for REPE @@ -3032,7 +3033,7 @@ writeback: ctxt->decode.mem_read.end = 0; /* Commit shadow register state. 
*/ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e5a833f339..8f45cc712dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3941,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) @@ -4945,6 +4946,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; } -- cgit v1.2.3 From ef050dc0390176ec6888f373edb776587c88be3d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:40 +0300 Subject: KVM: x86 emulator: set RFLAGS outside x86 emulator code Removes the need for set_flags() callback. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 1 - arch/x86/kvm/x86.c | 7 +------ 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7e00cb21c6..a87d95f0957 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,7 +142,6 @@ struct x86_emulate_ops { ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 437f31bcffe..291e220c69a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3034,7 +3034,6 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; - ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f45cc712dd..04ca343ee51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3806,11 +3806,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - kvm_x86_ops->set_rflags(vcpu, rflags); -} - static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -3829,7 +3824,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, - .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, .set_msr = kvm_set_msr, @@ -3941,6 +3935,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { -- cgit v1.2.3 From bdb475a323858101f4a5ad6a1a04b1dd8885325a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:41 +0300 Subject: KVM: x86 emulator: use shadowed register in emulate_sysexit() emulate_sysexit() should use shadowed registers copy instead of looking into vcpu state directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 291e220c69a..42cb7d71ff5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2063,8 +2063,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + c->eip = c->regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 4d2179e1e9cb74b25a8181a506600d96e15504fb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:42 +0300 Subject: KVM: x86 emulator: handle shadowed registers outside emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 --------------- arch/x86/kvm/x86.c | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 42cb7d71ff5..97a42e8c00d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -941,12 +941,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); - /* Shadow copy of register state. Committed on successful emulation. 
*/ - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { case X86EMUL_MODE_REAL: @@ -2486,16 +2483,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); c->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) { rc = writeback(ctxt, ops); if (rc == X86EMUL_CONTINUE) ctxt->eip = c->eip; @@ -2525,13 +2519,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; @@ -3031,8 +3018,6 @@ writeback: * without decoding */ ctxt->decode.mem_read.end = 0; - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04ca343ee51..21d36081a9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3844,7 +3844,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { int r, shadow_mask; - struct decode_cache *c; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3869,13 +3869,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ - c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -3916,6 +3917,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + /* this is needed for vmware backdoor interface to work since it + changes register values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); @@ -3936,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { @@ -4919,6 +4925,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l, ret; cache_all_regs(vcpu); @@ -4933,6 +4940,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, ?
X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, tss_selector, reason, has_error_code, @@ -4941,6 +4950,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; -- cgit v1.2.3 From 95cb229530f329ec8002274891793be9c91385f7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:43 +0300 Subject: KVM: x86 emulator: move interruptibility state tracking out of emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++----------------- arch/x86/kvm/x86.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97a42e8c00d..c40b40561df 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1843,20 +1843,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2516,7 +2502,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; - ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { @@ -2789,7 +2774,7 @@ special_insn: } if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); @@ -2958,7 +2943,7 @@ special_insn: if (emulator_bad_iopl(ctxt, ops)) kvm_inject_gp(ctxt->vcpu, 0); else { - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21d36081a9d..91bfe7771f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3838,12 +3838,26 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. 
The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type) { - int r, shadow_mask; + int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); @@ -3871,6 +3885,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_ctxt.interruptibility = 0; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3938,8 +3953,7 @@ restart: return EMULATE_FAIL; } - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); -- cgit v1.2.3 From 54b8486f469475d6c8e8aec917b91239a54eb8c8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:44 +0300 Subject: KVM: x86 emulator: do not inject exception directly into vcpu Return exception as a result of instruction emulation and handle injection in KVM code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 ++ arch/x86/kvm/emulate.c | 124 +++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 20 +++++- 3 files changed, 100 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a87d95f0957..51cfd730ac5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -216,6 +216,12 @@ struct x86_emulate_ctxt { int interruptibility; bool restart; /* restart string instruction after writeback */ + + int exception; /* exception that happens during emulation or -1 */ + u32 error_code; /* error code for exception */ + bool error_code_valid; + unsigned long cr2; /* faulted address in case of #PF */ + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c40b40561df..b43ac98ef79 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -653,6 +653,37 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, return seg_base(ctxt, ops, VCPU_SREG_SS); } +static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, + u32 error, bool valid) +{ + ctxt->exception = vec; + ctxt->error_code = error; + ctxt->error_code_valid = valid; + ctxt->restart = false; +} + +static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, GP_VECTOR, err, true); +} + +static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, + int err) +{ + ctxt->cr2 = addr; + emulate_exception(ctxt, PF_VECTOR, err, true); +} + +static void emulate_ud(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, TS_VECTOR, err, true); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -1285,7 +1316,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc 
== X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1366,13 +1397,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1391,14 +1422,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1517,7 +1548,7 @@ load: ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: - kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + emulate_exception(ctxt, err_vec, err_code, true); return X86EMUL_PROPAGATE_FAULT; } @@ -1578,7 +1609,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; case X86EMUL_MODE_VM86: if (iopl < 3) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } change_mask |= EFLG_IF; @@ -1829,7 +1860,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, + emulate_pf(ctxt, (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; @@ -1883,7 +1914,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1937,7 +1968,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -1945,7 +1976,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) * Therefore, we inject an #UD. 
*/ if (ctxt->mode == X86EMUL_MODE_PROT64) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1955,13 +1986,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; @@ -2004,7 +2035,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -2022,7 +2053,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = (u16)(msr_data + 24); @@ -2030,7 +2061,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = cs_sel + 8; @@ -2192,7 +2223,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2202,7 +2233,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2210,7 +2241,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2223,7 +2254,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2266,7 +2297,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, int ret; if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } c->eip = tss->eip; @@ -2334,7 +2365,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2344,7 +2375,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2352,7 +2383,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == 
X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2365,7 +2396,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2399,7 +2430,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } } @@ -2408,8 +2439,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, - tss_selector & 0xfffc); + emulate_ts(ctxt, tss_selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } @@ -2505,19 +2535,19 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* LOCK prefix is allowed only with some instructions */ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -2679,7 +2709,7 @@ special_insn: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, @@ -2691,7 +2721,7 @@ special_insn: c->src.bytes = min(c->src.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->src.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], @@ -2754,7 +2784,7 @@ special_insn: goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); @@ -2769,7 +2799,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -2895,7 +2925,7 @@ special_insn: do_io_in: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, @@ -2908,7 +2938,7 @@ special_insn: do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, @@ -2933,7 +2963,7 @@ special_insn: break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -2941,7 +2971,7 @@ special_insn: break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; @@ -3069,7 +3099,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 5: /* not defined */ - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, c->modrm_ea); @@ -3102,7 +3132,7 @@ twobyte_insn: case 1: case 5 ... 7: case 9 ... 15: - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); @@ -3111,7 +3141,7 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); @@ -3119,7 +3149,7 @@ twobyte_insn: break; case 0x22: /* mov reg, cr */ if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } c->dst.type = OP_NONE; @@ -3127,7 +3157,7 @@ twobyte_insn: case 0x23: /* mov from reg to dr */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -3135,7 +3165,7 @@ twobyte_insn: ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -3146,7 +3176,7 @@ twobyte_insn: msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } rc = X86EMUL_CONTINUE; @@ -3155,7 +3185,7 @@ twobyte_insn: case 0x32: /* rdmsr */ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91bfe7771f5..63c87adcec4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3852,6 +3852,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) kvm_x86_ops->set_interrupt_shadow(vcpu, mask); } +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception == PF_VECTOR) + kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + else if (ctxt->error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + else + kvm_queue_exception(vcpu, ctxt->exception); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3886,6 +3897,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.exception = -1; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3958,6 +3970,11 @@ restart: memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (vcpu->arch.emulate_ctxt.exception >= 0) { + inject_emulated_exception(vcpu); + return EMULATE_DONE; + } + if 
(vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; @@ -3970,9 +3987,6 @@ restart: return EMULATE_DO_MMIO; } - if (vcpu->arch.exception.pending) - vcpu->arch.emulate_ctxt.restart = false; - if (vcpu->arch.emulate_ctxt.restart) goto restart; -- cgit v1.2.3 From d94e1dc9af60e3431a586c3edfbe42d8a0d3932b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:54:48 +0300 Subject: KVM: Get rid of KVM_REQ_KICK KVM_REQ_KICK poisons vcpu->requests by having a bit set during normal operation. This causes the fast path check for a clear vcpu->requests to fail all the time, triggering tons of atomic operations. Fix by replacing KVM_REQ_KICK with a vcpu->guest_mode atomic. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63c87adcec4..fc5611b4007 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4604,13 +4604,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - local_irq_disable(); + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); - clear_bit(KVM_REQ_KICK, &vcpu->requests); - smp_mb__after_clear_bit(); + local_irq_disable(); - if (vcpu->requests || need_resched() || signal_pending(current)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); preempt_enable(); r = 1; @@ -4655,7 +4657,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - set_bit(KVM_REQ_KICK, &vcpu->requests); + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); ++vcpu->stat.exits; @@ -5580,7 +5583,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + if (atomic_xchg(&vcpu->guest_mode, 0)) smp_send_reschedule(cpu); put_cpu(); } -- cgit v1.2.3 From 3f10c846f8f9e6a32bbfeefaf7dda7ff51c7da29 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:42 +0200 Subject: KVM: SVM: Dump vmcb contents on failed vmrun This patch adds a function to dump the vmcb into the kernel log and calls it after a failed vmrun to ease debugging. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..685cffff01f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2726,6 +2726,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; +void dump_vmcb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + pr_err("VMCB Control Area:\n"); + pr_err("cr_read: %04x\n", control->intercept_cr_read); + pr_err("cr_write: %04x\n", control->intercept_cr_write); + pr_err("dr_read: %04x\n", control->intercept_dr_read); + pr_err("dr_write: %04x\n", control->intercept_dr_write); + pr_err("exceptions: %08x\n", control->intercept_exceptions); + pr_err("intercepts: %016llx\n", control->intercept); + pr_err("pause filter count: %d\n", control->pause_filter_count); + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); + pr_err("tsc_offset: %016llx\n", control->tsc_offset); + pr_err("asid: %d\n", control->asid); + pr_err("tlb_ctl: %d\n", control->tlb_ctl); + pr_err("int_ctl: %08x\n", control->int_ctl); + pr_err("int_vector: %08x\n", control->int_vector); + pr_err("int_state: %08x\n", control->int_state); + pr_err("exit_code: %08x\n", control->exit_code); + pr_err("exit_info1: %016llx\n", control->exit_info_1); + pr_err("exit_info2: %016llx\n", control->exit_info_2); + pr_err("exit_int_info: %08x\n", control->exit_int_info); + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); + pr_err("nested_ctl: %lld\n", control->nested_ctl); + pr_err("nested_cr3: %016llx\n", control->nested_cr3); + pr_err("event_inj: %08x\n", control->event_inj); + pr_err("event_inj_err: %08x\n", control->event_inj_err); + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); + pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("VMCB State Save Area:\n"); + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); + pr_err("cpl: %d efer: %016llx\n", + save->cpl, save->efer); + pr_err("cr0: %016llx cr2: %016llx\n", + save->cr0, 
save->cr2); + pr_err("cr3: %016llx cr4: %016llx\n", + save->cr3, save->cr4); + pr_err("dr6: %016llx dr7: %016llx\n", + save->dr6, save->dr7); + pr_err("rip: %016llx rflags: %016llx\n", + save->rip, save->rflags); + pr_err("rsp: %016llx rax: %016llx\n", + save->rsp, save->rax); + pr_err("star: %016llx lstar: %016llx\n", + save->star, save->lstar); + pr_err("cstar: %016llx sfmask: %016llx\n", + save->cstar, save->sfmask); + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", + save->kernel_gs_base, save->sysenter_cs); + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", + save->sysenter_esp, save->sysenter_eip); + pr_err("gpat: %016llx dbgctl: %016llx\n", + save->g_pat, save->dbgctl); + pr_err("br_from: %016llx br_to: %016llx\n", + save->br_from, save->br_to); + pr_err("excp_from: %016llx excp_to: %016llx\n", + save->last_excp_from, save->last_excp_to); + +} + static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2770,6 +2863,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + dump_vmcb(vcpu); return 0; } -- cgit v1.2.3 From eec4b140c924b4c650e9a89e01d223266490e325 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:44 +0200 Subject: KVM: SVM: Allow EFER.LMSLE to be set with nested svm This patch enables setting of efer bit 13 which is allowed in all SVM capable processors. This is necessary for the SLES11 version of Xen 4.0 to boot with nested svm. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kvm/svm.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae431862..509a42187dc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -20,6 +20,7 @@ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ #define EFER_SCE (1<<_EFER_SCE) @@ -27,6 +28,7 @@ #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 685cffff01f..41fe0381a1a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void) if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } for_each_possible_cpu(cpu) { -- cgit v1.2.3 From f3b8c964a9a6cfef6d3ca778648d53947b9fd257 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:09:21 +0800 Subject: KVM: MMU: mark page table dirty when a pte is actually modified Sometimes cmpxchg_gpte doesn't modify the gpte; in that case, don't mark the page table page as dirty.
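The fix is purely an ordering change: mark the page dirty only after cmpxchg_gpte has actually updated the gpte. In outline (condensed from the hunk below):

        if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
                                pte | PT_ACCESSED_MASK))
                goto walk;                          /* lost the race: retry the walk */
        mark_page_dirty(vcpu->kvm, table_gfn);      /* the write really happened */
        pte |= PT_ACCESSED_MASK;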
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c7f27779c99..5c8ac060442 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -177,10 +177,10 @@ walk: if (!(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -217,11 +217,11 @@ walk: bool ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; } -- cgit v1.2.3 From 518c5a05e89a79e498c95c3e29f29bd236b3c972 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:58:33 +0800 Subject: KVM: MMU: Fix debug output error in walk_addr() Fix a debug output error in walk_addr(): the pte_access and pt_access values were printed in swapped order. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5c8ac060442..15e379eaf3b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -229,7 +229,7 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pt_access, pte_access); + __func__, (u64)pte, pte_access, pt_access); return 1; not_present: -- cgit v1.2.3 From 54a4f0239f2e98bc0842818f611a4cf73bb7dd35 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:03:49 +0800 Subject: KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed Currently, kvm_mmu_zap_page() returns only the number of freed child sps. This can confuse the caller, because the caller does not learn the actual number of pages freed. Let's make kvm_mmu_zap_page() return the number of pages it actually freed.
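With the page itself included in the count, the caller-side accounting becomes a plain subtraction. A simplified sketch of the kvm_mmu_change_mmu_pages() loop touched below (pick_oldest_sp() is a hypothetical stand-in for the container_of() lookup on active_mmu_pages):

while (used_pages > kvm_nr_mmu_pages) {
        struct kvm_mmu_page *sp = pick_oldest_sp(kvm);  /* hypothetical helper */

        /* the return value now covers zapped children plus sp itself */
        used_pages -= kvm_mmu_zap_page(kvm, sp);
}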
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b666d8d106a..be981b1f188 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { + /* Count self */ + ret++; hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { @@ -1540,7 +1542,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); used_pages -= kvm_mmu_zap_page(kvm, page); - used_pages--; } kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; @@ -2941,7 +2942,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page) + 1; + return kvm_mmu_zap_page(kvm, page); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) -- cgit v1.2.3 From 6d77dbfc88e37c9efd5c5dd18445cfe819ae17ea Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 10 May 2010 11:16:56 +0300 Subject: KVM: inject #UD if instruction emulation fails and exit to userspace Do not kill VM when instruction emulation fails. Inject #UD and report failure to userspace instead. Userspace may choose to reenter guest if vcpu is in userspace (cpl == 3) in which case guest OS will kill offending process and continue running. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 5 +---- arch/x86/kvm/svm.c | 10 +++------- arch/x86/kvm/vmx.c | 28 ++++----------------------- arch/x86/kvm/x86.c | 43 +++++++++++++++++------------------------ 5 files changed, 26 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ca1867ed97..0c06148fa3b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,7 +576,6 @@ enum emulation_result { #define EMULTYPE_SKIP (1 << 2) int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be981b1f188..4a02dee1f2b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2814,11 +2814,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - return 0; + /* fall through */ case EMULATE_FAIL: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; return 0; default: BUG(); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41fe0381a1a..134260c36ce 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1535,7 +1535,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return 
emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -2386,16 +2386,12 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int cr8_write_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 59893173425..a82cfa1e2a4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3070,7 +3070,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -3327,22 +3327,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOEXEC; - } - return 1; + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; } static int handle_task_switch(struct kvm_vcpu *vcpu) @@ -3554,13 +3539,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - ret = 0; - goto out; - } + if (err != EMULATE_DONE) + return 0; if (signal_pending(current)) goto out; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc5611b4007..ae9d6f3e5d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3639,24 +3639,6 @@ int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) return __kvm_set_dr(vcpu, dr, value); } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - u8 opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; - - if (!printk_ratelimit()) - return; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - static u64 mk_cr_64(u64 curr_cr, u32 new_val) { return (curr_cr & ~((1ULL << 32) - 1)) | new_val; @@ -3863,6 +3845,19 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception); } +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = 
KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_FAIL; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3931,11 +3926,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } } @@ -3960,9 +3955,7 @@ restart: if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); @@ -4798,7 +4791,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r == EMULATE_DO_MMIO) { + if (r != EMULATE_DONE) { r = 0; goto out; } -- cgit v1.2.3 From f0f5933a1626c8df7b0bfd227819c66320fb4f0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:09:56 +0300 Subject: KVM: MMU: Fix free memory accounting race in mmu_alloc_roots() We drop the mmu lock between freeing memory and allocating the roots; this allows some other vcpu to sneak in and allocate memory. While the race is benign (resulting only in temporary overallocation, not oom) it is simple and easy to fix by moving the freeing close to the allocation. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a02dee1f2b..d7aebafffdf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,6 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2124,6 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2496,9 +2498,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); -- cgit v1.2.3 From 7725b89414836df492d6222b1d3cacb0ca576d77 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:38 +0800 Subject: KVM: VMX: Define new functions to wrapper direct call of asm code Define vmcs_load() and kvm_cpu_vmxon() to avoid direct call of asm code. Also move VMXE bit operation out of kvm_cpu_vmxoff(). 
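The resulting enable/disable ordering can be sketched as follows (illustrative only, condensed from the hunks below; note that the CR4.VMXE writes now live in the callers, bracketing VMXON/VMXOFF):

/* hardware_enable(), simplified: VMXE must be set before VMXON */
write_cr4(read_cr4() | X86_CR4_VMXE);
kvm_cpu_vmxon(phys_addr);

/* hardware_disable(), simplified: VMXOFF first, then clear VMXE */
kvm_cpu_vmxoff();
write_cr4(read_cr4() & ~X86_CR4_VMXE);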
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a82cfa1e2a4..82328882144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -453,6 +453,19 @@ static void vmcs_clear(struct vmcs *vmcs) vmcs, phys_addr); } +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -830,7 +843,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta, new_offset; if (vcpu->cpu != cpu) { @@ -844,15 +856,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); + vmcs_load(vmx->vmcs); } if (vcpu->cpu != cpu) { @@ -1288,6 +1293,13 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); @@ -1310,9 +1322,7 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); + kvm_cpu_vmxon(phys_addr); ept_sync_global(); @@ -1336,13 +1346,13 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { vmclear_local_vcpus(); kvm_cpu_vmxoff(); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, -- cgit v1.2.3 From 92fe13be74303a7b80dc3c99e22e12a87d41bd5f Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:42 +0800 Subject: KVM: VMX: Some minor changes to code structure Do some preparations for vmm coexistence support. 
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82328882144..0d281dbc008 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -845,15 +845,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) { + if (vcpu->cpu != cpu) vcpu_clear(vmx); - kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { per_cpu(current_vmcs, cpu) = vmx->vmcs; @@ -864,6 +857,13 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct desc_ptr dt; unsigned long sysenter_esp; + kvm_migrate_timers(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching -- cgit v1.2.3 From b923e62e4d48bc5242b32a6ef5ba0f886137668a Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:45 +0800 Subject: KVM: VMX: VMCLEAR/VMPTRLD usage changes Originally, VMCLEAR/VMPTRLD were called on vcpu migration. To support hosted VMM coexistence, VMCLEAR is executed on vcpu schedule-out, and VMPTRLD is executed on vcpu schedule-in. This also eliminates the IPI when doing VMCLEAR. Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0d281dbc008..9529bff0426 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -63,6 +63,9 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -845,7 +848,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) + if (vmm_exclusive && vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -891,6 +894,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) + __vcpu_clear(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 4610c9cc6d9c84f7d585583699f04d5f51c83671 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:48 +0800 Subject: KVM: VMX: VMXON/VMXOFF usage changes The SDM suggests that VMXON should be called before VMPTRLD, and VMXOFF should be called after doing VMCLEAR. Therefore, in the vmm coexistence case, we should first call VMXON before any VMCS operation, and then call VMXOFF after the operation is done.
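Taken together with the previous two patches, coexistence mode (vmm_exclusive=0) brackets every vCPU load/put roughly like this (illustrative sketch condensed from the hunks below, error handling omitted):

/* vmx_vcpu_load(): enter VMX operation before touching any VMCS */
kvm_cpu_vmxon(__pa(per_cpu(vmxarea, cpu)));     /* VMXON ... */
vmcs_load(vmx->vmcs);                           /* ... then VMPTRLD */

/* vmx_vcpu_put(): flush VMCS state before leaving VMX operation */
__vcpu_clear(vmx);                              /* VMCLEAR ... */
kvm_cpu_vmxoff();                               /* ... then VMXOFF */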
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9529bff0426..b8aac4e9890 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -176,6 +176,8 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -847,8 +849,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - if (vmm_exclusive && vcpu->cpu != cpu) + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -894,8 +899,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) + if (!vmm_exclusive) { __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1327,9 +1334,11 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } return 0; } @@ -1355,8 +1364,10 @@ static void kvm_cpu_vmxoff(void) static void hardware_disable(void *garbage) { - vmclear_local_vcpus(); - kvm_cpu_vmxoff(); + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -3991,6 +4002,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vmx); } +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4016,7 +4040,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->vmcs) goto free_msrs; - vmcs_clear(vmx->vmcs); + vmcs_init(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); -- cgit v1.2.3 From 24955b6c906045382b67f3e6beba7e5df4a4a045 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 12 May 2010 21:00:35 -0300 Subject: KVM: pass correct parameter to kvm_mmu_free_some_pages Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7aebafffdf..a455c5eee37 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,7 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2125,7 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); - 
kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); -- cgit v1.2.3 From dfb507c41d0d12fc69820abb7f040d31fcf015fe Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Tue, 11 May 2010 22:22:40 +0300 Subject: KVM: x86 emulator: Add test acc, imm instruction (opcodes 0xA8 - 0xA9) This adds test acc, imm instruction to the x86 emulator Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b43ac98ef79..35dd57c5a7f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -181,7 +181,7 @@ static u32 opcode_table[256] = { ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ @@ -2754,6 +2754,7 @@ special_insn: } break; case 0x84 ... 0x85: + test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ @@ -2852,6 +2853,8 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); goto cmp; + case 0xa8 ... 0xa9: /* test ax, imm */ + goto test; case 0xaa ... 0xab: /* stos */ c->dst.val = c->regs[VCPU_REGS_RAX]; break; -- cgit v1.2.3 From abc190830f28a5bb678eaccb633de02ed2967d55 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:21 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for sub instruction This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 35dd57c5a7f..1b974f80e1e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -126,7 +126,7 @@ static u32 opcode_table[256] = { /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From 222b7c52c33bdef721248bfeba992af495800d30 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:22 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for xor instructions This adds missing decoder flags for xor instructions (opcodes 0x34 - 0x35) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1b974f80e1e..7a36eec8bab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ static u32 opcode_table[256] = { /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | 
SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From 62ad07551a2ace89e35604d1c55fdae1dd3359a8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:41 +0800 Subject: KVM: x86: Clean up duplicate assignment mmu.free() already sets root_hpa to INVALID_PAGE, so there is no need to do it again in destroy_kvm_mmu(). kvm_x86_ops->set_cr4() and set_efer() already assign cr4/efer to vcpu->arch.cr4/efer, so there is no need to do it again later. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- arch/x86/kvm/x86.c | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c5eee37..c075542648c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,10 +2478,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae9d6f3e5d0..03039fd8698 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -486,7 +486,7 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; + kvm_mmu_reset_context(vcpu); return 0; @@ -721,8 +721,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.efer = efer; - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); -- cgit v1.2.3 From aad827034e419fa8c5ec39e6455266f0b942d856 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:42 +0800 Subject: KVM: VMX: Only reset MMU when necessary Only modifying certain bits of CR0/CR4 requires a paging mode switch. Modifying the EFER.NXE bit would result in reserved bit updates.
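The idiom applied at all three sites (CR0, CR4, EFER.NX) can be sketched with a hypothetical helper; the bit masks follow the hunks below:

/* Reset the MMU context only when a mode-affecting bit flips. */
static inline bool needs_mmu_reset(unsigned long old_val,
                                   unsigned long new_val,
                                   unsigned long watched_bits)
{
        return (old_val ^ new_val) & watched_bits;      /* XOR isolates changed bits */
}

/* e.g. for CR0: */
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
                            X86_CR0_CD | X86_CR0_NW;
if (needs_mmu_reset(old_cr0, cr0, update_bits))
        kvm_mmu_reset_context(vcpu);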
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03039fd8698..78147f0421a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -416,6 +416,10 @@ out: static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 @@ -449,7 +453,8 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - kvm_mmu_reset_context(vcpu); + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -487,7 +492,8 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_x86_ops->set_cr4(vcpu, cr4); - kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -693,6 +699,8 @@ static u32 emulated_msrs[] = { static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { + u64 old_efer = vcpu->arch.efer; + if (efer & efer_reserved_bits) return 1; @@ -724,6 +732,10 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + return 0; } -- cgit v1.2.3 From e8ad9a707496c163312bcdd6aa3b90603d45dc9b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:06:02 +0800 Subject: KVM: MMU: use proper cache object freeing function Use kmem_cache_free to free objects allocated by kmem_cache_alloc. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c075542648c..bb48b0ca5f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -305,10 +305,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) { while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); + kmem_cache_free(cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -356,10 +357,11 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -380,7 +382,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kfree(pc); + kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -391,7 +393,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void 
mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kfree(rd); + kmem_cache_free(rmap_desc_cache, rd); } /* @@ -898,7 +900,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->link); __free_page(virt_to_page(sp->spt)); __free_page(virt_to_page(sp->gfns)); - kfree(sp); + kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } -- cgit v1.2.3 From 6d74229f013ed8e4a00d74cfa7a3fa6a2315c467 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:07:00 +0800 Subject: KVM: MMU: remove rmap before clear spte Remove rmap before clear spte otherwise it will trigger BUG_ON() in some functions such as rmap_write_protect(). Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb48b0ca5f8..5c9d6df0113 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1813,6 +1813,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; + rmap_remove(vcpu->kvm, sptep); spte = shadow_trap_nonpresent_pte; goto set_pte; } -- cgit v1.2.3 From f55c3f419ab1f0a9d66f44ceeefe752975ae4233 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:08:08 +0800 Subject: KVM: MMU: unalias gfn before sp->gfns[] comparison in sync_page sp->gfns[] contain unaliased gfns, but gpte might contain pointer to aliased region. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15e379eaf3b..22f13797f52 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -586,7 +586,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; - gfn_t gfn = sp->gfns[i]; + gfn_t gfn; if (!is_shadow_present_pte(sp->spt[i])) continue; @@ -597,8 +597,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || - !(gpte & PT_ACCESSED_MASK)) { + gfn = gpte_to_gfn(gpte); + if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); -- cgit v1.2.3 From 1683b2416e4c514d30ff5844a06733d0444ee000 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:09:57 +0800 Subject: KVM: x86: cleanup unused local variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kvm/x86.c: In function ‘handle_emulation_failure’: arch/x86/kvm/x86.c:3844: warning: unused variable ‘ctxt’ Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78147f0421a..b05321adfd2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3857,8 +3857,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static int handle_emulation_failure(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); vcpu->run->exit_reason = 
KVM_EXIT_INTERNAL_ERROR; -- cgit v1.2.3 From 2122ff5eab8faec853e43f6de886e8dc8f31e317 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:25:04 +0300 Subject: KVM: move vcpu locking to dispatcher for generic vcpu ioctls All vcpu ioctls need to be locked, so instead of locking each one specifically we lock at the generic dispatcher. This patch only updates generic ioctls and leaves arch specific ioctls alone. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b05321adfd2..5acd21245fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4773,8 +4773,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4815,14 +4813,11 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4845,15 +4840,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4878,8 +4869,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - vcpu_put(vcpu); - return 0; } @@ -4898,8 +4887,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4931,26 +4918,20 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); return 0; } @@ -4996,8 +4977,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits; struct desc_ptr dt; - vcpu_load(vcpu); - dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -5057,8 +5036,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); - return 0; } @@ -5068,12 +5045,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; - vcpu_load(vcpu); - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - goto unlock_out; + goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) 
kvm_queue_exception(vcpu, DB_VECTOR); else @@ -5115,8 +5090,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -unlock_out: - vcpu_put(vcpu); +out: return r; } @@ -5152,7 +5126,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5160,7 +5133,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - vcpu_put(vcpu); return 0; } @@ -5169,8 +5141,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -5180,8 +5150,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } @@ -5189,8 +5157,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; fxsave->swd = fpu->fsw; @@ -5200,8 +5166,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } -- cgit v1.2.3 From 526b78ad1a9e66ef240ad7c757988de039e42229 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:53:06 +0300 Subject: KVM: x86: Lock arch specific vcpu ioctls centrally Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5acd21245fc..999b017011f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1541,16 +1541,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1798,7 +1794,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1816,7 +1811,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1837,11 +1831,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); return 0; out: @@ -1854,7 +1846,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1866,7 
+1857,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -2098,9 +2088,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2108,11 +2096,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2124,20 +2110,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - vcpu_put(vcpu); - return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2157,7 +2138,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2172,7 +2152,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2230,8 +2209,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2256,8 +2233,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2268,8 +2243,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2292,22 +2265,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - vcpu_put(vcpu); - return 0; } static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - vcpu_load(vcpu); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2316,14 +2283,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - vcpu_load(vcpu); - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; - vcpu_put(vcpu); - return 0; } @@ -2335,6 +2298,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; + vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2481,9 +2445,7 @@ long kvm_arch_vcpu_ioctl(struct file 
*filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; - vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2534,6 +2496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + vcpu_put(vcpu); kfree(lapic); return r; } -- cgit v1.2.3 From 93736624635235cc5372ffca6d62816d02170724 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 12:35:17 +0300 Subject: KVM: Consolidate arch specific vcpu ioctl locking Now that all arch specific ioctls have centralized locking, it is easy to move it to the central dispatcher. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 999b017011f..4c2096f30d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2298,7 +2298,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; - vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2496,7 +2495,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - vcpu_put(vcpu); kfree(lapic); return r; } -- cgit v1.2.3 From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:22:23 +0800 Subject: x86: Export FPU API for KVM use Also add some constants. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/i387.h | 2 ++ arch/x86/include/asm/xsave.h | 3 +++ arch/x86/kernel/i387.c | 3 ++- arch/x86/kernel/process.c | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..815c5b2b9f5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) memcpy(dst->state, src->state, xstate_size); } +extern void fpu_finit(struct fpu *fpu); + #endif /* __ASSEMBLY__ */ #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..29ee4e4c64c 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,9 @@ #define FXSAVE_SIZE 512 +#define XSTATE_YMM_SIZE 256 +#define XSTATE_YMM_OFFSET (512 + 64) + /* * These are the features that the OS can handle currently. 
*/ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b3225..c4444bce846 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -107,7 +107,7 @@ void __cpuinit fpu_init(void) } #endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..ebcfcceccc7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.3 From 7cf30855e02be7a207ffebb8b9350986f2ba83e9 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:27 +0800 Subject: KVM: x86: Use unlazy_fpu() for host FPU We can avoid unnecessary fpu load when userspace process didn't use FPU frequently. Derived from Avi's idea. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 18 ++---------------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0c06148fa3b..d93601c5290 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,6 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; gva_t mmio_fault_cr2; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c2096f30d9..54ce77582ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -52,6 +52,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5134,21 +5135,10 @@ void fx_init(struct kvm_vcpu *vcpu) { unsigned after_mxcsr_mask; - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. - */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_finit(); kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); vcpu->arch.cr0 |= X86_CR0_ET; @@ -5165,7 +5155,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); + unlazy_fpu(current); kvm_fx_restore(&vcpu->arch.guest_fx_image); trace_kvm_fpu(1); } @@ -5177,7 +5167,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5203,9 +5192,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - /* We do fxsave: this must be aligned. 
*/ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); -- cgit v1.2.3 From 98918833a3e21ffc5619535955e7a003cb788163 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:28 +0800 Subject: KVM: x86: Use FPU API Convert KVM to use generic FPU API. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 17 +------------- arch/x86/kvm/x86.c | 52 +++++++++++++---------------------------- 2 files changed, 17 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d93601c5290..d08bb4a202d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,7 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct guest_fx_image; + struct fpu guest_fpu; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -708,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void kvm_fx_finit(void) -{ - asm("finit"); -} - static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54ce77582ed..84b1788489d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -53,6 +53,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5057,27 +5058,6 @@ out: return r; } -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - /* * Translate a guest virtual address to a guest physical address. 
*/ @@ -5101,7 +5081,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5117,7 +5098,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5133,22 +5115,18 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) void fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - preempt_enable(); + fpu_alloc(&vcpu->arch.guest_fpu); + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) @@ -5156,7 +5134,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5166,7 +5144,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5179,6 +5157,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5213,6 +5192,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -- cgit v1.2.3 From 1d9dc7e000915b9607b480e34fcb4238b789fbb1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:51:24 +0800 Subject: KVM: MMU: split kvm_sync_page() function Split kvm_sync_page() into kvm_sync_page() and kvm_sync_page_transient() to clarify the code address Avi's suggestion kvm_sync_page_transient() function only update shadow page but not mark it sync and not write protect sp->gfn. 
It will be used by a later patch. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c9d6df0113..ef5d140a270 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1199,16 +1199,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - kvm_unlink_unsync_page(vcpu->kvm, sp); + if (clear_unsync) { + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + } + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; @@ -1218,6 +1222,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } +static void mmu_convert_notrap(struct kvm_mmu_page *sp); +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int ret; + + ret = __kvm_sync_page(vcpu, sp, false); + if (!ret) + mmu_convert_notrap(sp); + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + return __kvm_sync_page(vcpu, sp, true); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; -- cgit v1.2.3 From e02aa901b1aa41fb541521800cc2a4774c162485 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:52:34 +0800 Subject: KVM: MMU: don't write-protect if have new mapping to unsync page Two cases may happen in the kvm_mmu_get_page() function: - in one case, the goal sp is already in the cache; if the sp is unsync, we only need to update it to ensure the mapping is still valid, but we do not mark it sync and do not write-protect sp->gfn, since this does not break the unsync rule (one shadow page for a gfn) - in the other case, the goal sp does not exist yet; we need to create a new sp for the gfn, i.e. the gfn may already have another shadow page, so to keep the unsync rule we should sync (mark sync and write-protect) the gfn's unsync shadow page.
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef5d140a270..064ddfbde10 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1337,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *unsync_sp = NULL; struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; @@ -1356,20 +1356,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { if (sp->unsync) - if (kvm_sync_page(vcpu, sp)) - continue; + unsync_sp = sp; if (sp->role.word != role.word) continue; + if (!direct && unsync_sp && + kvm_sync_page_transient(vcpu, unsync_sp)) { + unsync_sp = NULL; + break; + } + mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); - } + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + trace_kvm_mmu_get_page(sp, false); return sp; } + if (!direct && unsync_sp) + kvm_sync_page(vcpu, unsync_sp); + ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) -- cgit v1.2.3 From f78978aa3a8222f7822f15fba5dbaea990ef0887 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:53:35 +0800 Subject: KVM: MMU: only update unsync page in invlpg path Only unsync pages need to be updated at invlpg time, since other shadow pages are write-protected. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 22f13797f52..0671d7a29c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; gpa_t pte_gpa = -1; int level; u64 *sptep; @@ -472,10 +473,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; + sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); int offset, shift; + if (!sp->unsync) + break; + shift = PAGE_SHIFT - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; offset = sp->role.quadrant << shift; @@ -493,7 +497,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) break; } - if (!is_shadow_present_pte(*sptep)) + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } -- cgit v1.2.3 From 9fb2d2b4ff292a01ae30da003d1dc097917b0988 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 23 May 2010 14:28:26 +0300 Subject: KVM: SVM: correctly trace irq injection On SVM, interrupts are injected by svm_set_irq(), not svm_inject_irq(). The latter is used only to wait for the interrupt window.
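Condensed from the svm.c hunk that follows: the tracepoint and the statistics counter move from the window-waiting helper into the real injection path. A simplified sketch, not the literal patch:

	static void svm_set_irq(struct kvm_vcpu *vcpu)		/* the actual injection path */
	{
		struct vcpu_svm *svm = to_svm(vcpu);

		trace_kvm_inj_virq(vcpu->arch.interrupt.nr);	/* trace the injected vector here... */
		++vcpu->stat.irq_injections;			/* ...and count it here */

		svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
			SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
	}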
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 134260c36ce..f5c2b432078 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2917,9 +2917,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - trace_kvm_inj_virq(irq); - - ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -2933,6 +2930,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -- cgit v1.2.3 From 221d059d15f1c8bd070a63fd45cd8d2598af5f99 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 23 May 2010 18:37:00 +0300 Subject: KVM: Update Red Hat copyrights Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/i8254.c | 1 + arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/irq.c | 1 + arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/paging_tmpl.h | 1 + arch/x86/kvm/svm.c | 1 + arch/x86/kvm/timer.c | 14 ++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 11 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7a36eec8bab..a4c2dcd1032 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,6 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Avi Kivity * Yaniv Kamay diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25..188d82762c1 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,6 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338..2c73f449314 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a..0f4e488331c 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,6 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9..d8258a0060f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,6 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affilates. 
* * Authors: * Dor Laor diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 064ddfbde10..25d3bb2543e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0671d7a29c3..167f53357ee 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5c2b432078..02ea5cf4b1e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,6 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ff..564548fbb3d 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -1,3 +1,17 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + #include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8aac4e9890..9c3ffc5fde4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,6 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84b1788489d..033b9c207f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity -- cgit v1.2.3 From 9cf5cf5ad43b293581e5b87678ea5783c06d1a41 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:40:07 +0800 Subject: KVM: MMU: allow more pages to become unsync at gfn mapping time In the current code, a shadow page can become unsync only if there is a single shadow page for a gfn; this rule is too strict. In fact, we can let every last-level mapping page (i.e., the pte page) become unsync, and sync them at invlpg or tlb-flush time. This patch allows more pages to become unsync at gfn mapping time.
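Condensed, the relaxed decision in mmu_need_write_protect() (taken from the mmu.c diff that follows; not new code) looks like this:

	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
		if (s->gfn != gfn || s->role.direct || s->role.invalid)
			continue;
		if (s->role.level != PT_PAGE_TABLE_LEVEL)
			return 1;			/* non-leaf shadow page: keep write protection */
		if (!need_unsync && !s->unsync) {
			if (!can_unsync || !oos_shadow)
				return 1;
			need_unsync = true;		/* leaf page: may be switched to unsync */
		}
	}
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);		/* mark all of the gfn's leaf pages unsync */
	return 0;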
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25d3bb2543e..ba119dae890 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1170,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: found role %x\n", - __func__, sp->role.word); - return sp; - } - return NULL; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); @@ -1759,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + trace_kvm_mmu_unsync_page(sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + kvm_mmu_mark_parents_unsync(sp); + mmu_convert_notrap(sp); +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - unsigned index; struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; + unsigned index; - index = kvm_page_table_hashfn(sp->gfn); + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.direct) + if (s->gfn != gfn || s->role.direct || s->unsync || + s->role.invalid) continue; - if (s->role.word != sp->role.word) - return 1; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); } - trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; - sp->unsync = 1; - - kvm_mmu_mark_parents_unsync(sp); - - mmu_convert_notrap(sp); - return 0; } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *shadow; + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + bool need_unsync = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || s->role.direct || s->role.invalid) + continue; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (shadow->unsync) - return 0; - if (can_unsync && oos_shadow) - return kvm_unsync_page(vcpu, shadow); - return 1; + + if (!need_unsync && !s->unsync) { + if (!can_unsync || !oos_shadow) + return 1; + need_unsync = true; + } } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); return 0; } -- cgit v1.2.3 From 9f1a122f970dbef5ba3496587f39df5c1853083f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:41:33 +0800
Subject: KVM: MMU: allow more pages to become unsync at getting sp time Allow more pages to become unsync when getting a new sp. If we need to create a new shadow page for a gfn but it is not allowed to be unsync (level > 1), we should sync all of the gfn's unsync pages. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba119dae890..07673487fd5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1220,6 +1220,35 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return __kvm_sync_page(vcpu, sp, true); } +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + unsigned index; + bool flush = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || !s->unsync || s->role.invalid) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s))) { + kvm_mmu_zap_page(vcpu->kvm, s); + continue; + } + kvm_unlink_unsync_page(vcpu->kvm, s); + flush = true; + } + + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1318,8 +1347,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp, *unsync_sp = NULL; + struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; + bool need_sync = false; role = vcpu->arch.mmu.base_role; role.level = level; @@ -1336,17 +1366,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { - if (sp->unsync) - unsync_sp = sp; + if (!need_sync && sp->unsync) + need_sync = true; if (sp->role.word != role.word) continue; - if (!direct && unsync_sp && - kvm_sync_page_transient(vcpu, unsync_sp)) { - unsync_sp = NULL; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - } mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); @@ -1358,9 +1385,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, false); return sp; } - if (!direct && unsync_sp) - kvm_sync_page(vcpu, unsync_sp); - ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) @@ -1371,6 +1395,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) -- cgit v1.2.3 From c8174f7b35b3018c4c7b3237ed1c792e454fd5c3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 24 May 2010 01:01:04 +0300 Subject: KVM: VMX: Add constant for invalid guest state exit reason For the sake of completeness, this patch adds a symbolic constant for VMX exit reason 0x21 (invalid guest state).
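For illustration only (this wiring is not part of the patch), the constant lets an exit-reason check read symbolically instead of as a magic number; handle_invalid_guest_state() here is a hypothetical handler name:

	/* hypothetical usage sketch -- handler name assumed, not from the patch */
	if (exit_reason == EXIT_REASON_INVALID_STATE)	/* 0x21, previously a bare literal */
		return handle_invalid_guest_state(vcpu);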
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9e6779f7cf2..104cf86a756 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -257,6 +257,7 @@ enum vmcs_field { #define EXIT_REASON_IO_INSTRUCTION 30 #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 -- cgit v1.2.3 From 2032a93d66fa282ba0f2ea9152eeff9511fa9a96 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:49:59 +0800 Subject: KVM: MMU: Don't allocate gfns page for direct mmu pages When sp->role.direct is set, sp->gfns does not contain any essential information, leaf sptes reachable from this sp are for a continuous guest physical memory range (a linear range). So sp->gfns[i] (if it was set) equals to sp->gfn + i. (PT_PAGE_TABLE_LEVEL) Obviously, it is not essential information, we can calculate it when need. It means we don't need sp->gfns when sp->role.direct=1, Thus we can save one page usage for every kvm_mmu_page. Note: Access to sp->gfns must be wrapped by kvm_mmu_page_get_gfn() or kvm_mmu_page_set_gfn(). It is only exposed in FNAME(sync_page). Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 +++++++++++++++++++++++++++++--------- arch/x86/kvm/paging_tmpl.h | 3 +++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07673487fd5..f46b6c9aff2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -397,6 +397,22 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) kmem_cache_free(rmap_desc_cache, rd); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; +} + /* * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. 
@@ -547,7 +563,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -605,6 +621,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; pfn_t pfn; + gfn_t gfn; unsigned long *rmapp; int i; @@ -616,7 +633,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -900,7 +918,8 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } @@ -911,13 +930,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) + u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -1386,7 +1407,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; @@ -3403,7 +3424,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (*sptep & PT_WRITABLE_MASK) { rev_sp = page_header(__pa(sptep)); - gfn = rev_sp->gfns[sptep - rev_sp->spt]; + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) @@ -3417,8 +3438,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 167f53357ee..2ee7060a80a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -582,6 +582,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) offset = nr_present = 0; + /* direct kvm_mmu_page can not be unsync. 
*/ + BUG_ON(sp->role.direct); + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; -- cgit v1.2.3 From c9fa0b3bef9a0b117b3c3f958ec553c21f609a9f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:25 +0800 Subject: KVM: MMU: Calculate correct base gfn for direct non-DIR level In Document/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in __direct_map(), the base gfn calculation is incorrect, it does not calculate correctly when level=3 or 4. Fix by using PT64_LVL_ADDR_MASK() which accounts for all levels correctly. Reported-by: Marcelo Tosatti Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f46b6c9aff2..c0350be52c9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2020,7 +2020,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); -- cgit v1.2.3 From 3af1817a0d65e8c1317e8d23cfe8a91aa1d4a065 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:19 +0800 Subject: KVM: MMU: calculate correct gfn for small host pages backing large guest pages In Documentation/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in function FNAME(fetch)(), sp->gfn is incorrect when one of following situations occurred: 1) guest is 32bit paging and the guest PDE maps a 4-MByte page (backed by 4k host pages), FNAME(fetch)() miss handling the quadrant. And if guest use pse-36, "table_gfn = gpte_to_gfn(gw->ptes[level - delta]);" is incorrect. 2) guest is long mode paging and the guest PDPTE maps a 1-GByte page (backed by 4k or 2M host pages). So we fix it to suit to the document and suit to the code which requires sp->gfn correct when sp->role.direct=1. We use the goal mapping gfn(gw->gfn) to calculate the base page frame for linear translations, it is simple and easy to be understood. Reported-by: Marcelo Tosatti Reported-by: Gui Jianfeng Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2ee7060a80a..1f7f5dd8306 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,10 +339,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - delta]); - /* advance table_gfn when emulating 1gb pages with 4k */ - if (delta == 0) - table_gfn += PT_INDEX(addr, level); + /* + * It is a large guest pages backed by small host pages, + * So we set @direct(@shadow_page->role.direct)=1, and + * set @table_gfn(@shadow_page->gfn)=the base page frame + * for linear translations. 
+ */ + table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; } else { direct = 0; -- cgit v1.2.3 From 01c168ac3d6568fed0373d82bd2db2b9339aab16 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 27 May 2010 16:09:48 +0800 Subject: KVM: MMU: don't check PT_WRITABLE_MASK directly Since we have is_writable_pte(), make use of it. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0350be52c9..9f4be0114bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2990,7 +2990,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (is_writable_pte(pt[i])) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); @@ -3425,7 +3425,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (*sptep & PT_WRITABLE_MASK) { + if (is_writable_pte(*sptep)) { rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); @@ -3474,7 +3474,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) if (!(ent & PT_PRESENT_MASK)) continue; - if (!(ent & PT_WRITABLE_MASK)) + if (!is_writable_pte(ent)) continue; inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } @@ -3508,7 +3508,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { - if (*spte & PT_WRITABLE_MASK) + if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, -- cgit v1.2.3 From 6dc696d4ddf2181eefee361e1d24a49351aef1f6 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 26 May 2010 15:09:43 -1000 Subject: KVM: SVM: Fix EFER.LME being stripped Must set VCPU register to be the guest notion of EFER even if that setting is not valid on hardware. This was masked by the set in set_efer until 7657fd5ace88e8092f5f3a84117e093d7b893f26 broke that. Fix is simply to set the VCPU register before stripping bits. Signed-off-by: Zachary Amsden Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 02ea5cf4b1e..9c68a650f57 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -286,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + vcpu->arch.efer = efer; if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) -- cgit v1.2.3 From 10ab25cd6bf7ee4e5a55d81f203f7dc1a855c27e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 25 May 2010 16:01:50 +0200 Subject: KVM: x86: Propagate fpu_alloc errors Memory allocation may fail. Propagate such errors. 
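The propagation pattern, condensed from the x86.c hunk that follows (assuming the fpu_alloc()/fpu_finit() API shown in the diff):

	int fx_init(struct kvm_vcpu *vcpu)
	{
		int err;

		err = fpu_alloc(&vcpu->arch.guest_fpu);	/* may fail, e.g. -ENOMEM */
		if (err)
			return err;			/* propagate instead of ignoring */

		fpu_finit(&vcpu->arch.guest_fpu);
		vcpu->arch.cr0 |= X86_CR0_ET;
		return 0;
	}

Callers (svm_create_vcpu(), vmx_vcpu_reset()) then check the return value and unwind, as the diff shows.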
Signed-off-by: Jan Kiszka Reviewed-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 7 ++++++- arch/x86/kvm/vmx.c | 4 +++- arch/x86/kvm/x86.c | 11 +++++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d08bb4a202d..0cd0f2923af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -624,7 +624,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -void fx_init(struct kvm_vcpu *vcpu); +int fx_init(struct kvm_vcpu *vcpu); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c68a650f57..2ae0c392329 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -904,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - fx_init(&svm->vcpu); + err = fx_init(&svm->vcpu); + if (err) + goto free_page4; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9c3ffc5fde4..e71c731433e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,7 +2659,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - fx_init(&vmx->vcpu); + ret = fx_init(&vmx->vcpu); + if (ret != 0) + goto out; seg_setup(VCPU_SREG_CS); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 033b9c207f9..e6e0d7781af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5114,12 +5114,19 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu) { - fpu_alloc(&vcpu->arch.guest_fpu); + int err; + + err = fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; + + return 0; } EXPORT_SYMBOL_GPL(fx_init); -- cgit v1.2.3 From 8184dd38e22fcaec664c2b98c382b85c26780e26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 27 May 2010 14:22:51 +0300 Subject: KVM: MMU: Allow spte.w=1 for gpte.w=0 and cr0.wp=0 only in shadow mode When tdp is enabled, the guest's cr0.wp shouldn't have any effect on spte permissions. 
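Condensed from the one-line change below: the cr0.wp override now applies only under shadow paging; with tdp, guest cr0.wp must not influence spte writability. A sketch of the guard, not the full set_spte():

	/* grant spte write access either because the gpte allows it, or --
	 * under shadow paging only -- because cr0.wp=0 permits a supervisor
	 * write despite a read-only gpte
	 */
	if ((pte_access & ACC_WRITE_MASK)
	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
		&& !user_fault)) {
		/* ... mark the spte writable ... */
	}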
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9f4be0114bc..69d40a6e1e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1882,7 +1882,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) + && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { -- cgit v1.2.3 From b66d80006e415ee083e59c9429911eab78047f8f Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 31 May 2010 17:11:39 +0800 Subject: KVM: MMU: Don't calculate quadrant if tdp_enabled There's no need to calculate quadrant if tdp is enabled. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 69d40a6e1e6..d3cd102aee2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit v1.2.3 From 5120702e732ed72c7055f511f8dd01de36424569 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 31 May 2010 22:40:54 +0300 Subject: KVM: VMX: Properly return error to userspace on vmentry failure The vmexit handler returns KVM_EXIT_UNKNOWN since there is no handler for vmentry failures. This intercepts vmentry failures and returns KVM_FAIL_ENTRY to userspace instead. Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e71c731433e..8c9085d44c3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3665,6 +3665,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + if (unlikely(vmx->fail)) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason -- cgit v1.2.3 From 4bc9b9828150747386130ab172f7e868e1a0fc2a Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 2 Jun 2010 14:05:24 +0800 Subject: KVM: VMX: Enforce EPT pagetable level checking We only support 4 levels EPT pagetable now. 
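In outline (full hunk below), hardware_setup() now degrades gracefully when the 4-level page-walk capability is absent:

	/* sketch of the capability gating shown in the diff */
	if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) {
		enable_ept = 0;			/* fall back to shadow paging */
		enable_unrestricted_guest = 0;	/* unrestricted guest requires EPT */
	}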
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8c9085d44c3..2201e381620 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -340,6 +340,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -1568,7 +1573,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) { + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; } -- cgit v1.2.3 From 7bee342a9e994cce7122cb187b4f3ded9d871165 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 2 Jun 2010 17:06:03 +0800 Subject: KVM: x86: use linux/uaccess.h instead of asm/uaccess.h Should use linux/uaccess.h instead of asm/uaccess.h Signed-off-by: Lai Jiangshan Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6e0d7781af..b08c0052e33 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,13 +42,13 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS #include "trace.h" #include -#include #include #include #include -- cgit v1.2.3 From 518c8aee5ca74fc03273fc6b4893cf456d65d545 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 4 Jun 2010 08:51:39 +0800 Subject: KVM: VMX: Make sure single type invvpid is supported before issuing invvpid instruction According to SDM, we need check whether single-context INVVPID type is supported before issuing invvpid instruction. 
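The guarded invalidation, condensed from the vmx.c hunk that follows; single-context INVVPID is issued only when the capability bit advertises it:

	static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
	{
		if (vmx->vpid == 0)
			return;

		/* only issue single-context INVVPID if the hardware supports it */
		if (cpu_has_vmx_invvpid_single())
			__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
	}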
Signed-off-by: Gui Jianfeng Reviewed-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 104cf86a756..b4e28400c9f 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -376,6 +376,8 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2201e381620..94526536188 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -360,6 +360,11 @@ static inline bool cpu_has_vmx_invept_global(void) return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -504,7 +509,8 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) if (vmx->vpid == 0) return; - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } static inline void ept_sync_global(void) -- cgit v1.2.3 From 03116aa57e75b1bbe8b5e04f3cd21cdb6588c4ba Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:52:17 +0800 Subject: KVM: MMU: skip invalid sp when unprotect page In kvm_mmu_unprotect_page(), invalid sps can be skipped. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cd102aee2..3ac51153bc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1629,7 +1629,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) bucket = &kvm->arch.mmu_page_hash[index]; restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct) { + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; -- cgit v1.2.3 From 7ae680eb2d5f0cb10ca0e6d1ff5ecb145befe8e4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:07 +0800 Subject: KVM: MMU: introduce some macros to clean up hlist traversing Introduce for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() to clean up hlist traversal. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 122 +++++++++++++++++++++--------------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ac51153bc4..881ad918455 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1201,6 +1201,17 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ +
&(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) { @@ -1244,16 +1255,12 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) /* @gfn should be write-protected at the call site */ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; bool flush = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || !s->unsync || s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1365,9 +1372,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; - unsigned index; unsigned quadrant; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; bool need_sync = false; @@ -1383,36 +1388,34 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) - if (sp->gfn == gfn) { - if (!need_sync && sp->unsync) - need_sync = true; + for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + if (!need_sync && sp->unsync) + need_sync = true; - if (sp->role.word != role.word) - continue; + if (sp->role.word != role.word) + continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); - trace_kvm_mmu_get_page(sp, false); - return sp; - } + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, bucket); + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -1617,46 +1620,34 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *n; int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + 
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; + } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *nn; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: zap %lx %x\n", - __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; } } @@ -1799,17 +1790,11 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; - - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->unsync || - s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); __kvm_unsync_page(vcpu, s); @@ -1819,18 +1804,11 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; bool need_unsync = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->role.invalid) - continue; - + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2703,8 +2681,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2772,13 +2748,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) - continue; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { pte_size = sp->role.cr4_pae ? 
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; -- cgit v1.2.3 From 7775834a233478ec855b97e30727248f12eafe76 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:54 +0800 Subject: KVM: MMU: split the operations of kvm_mmu_zap_page() Use kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to split the kvm_mmu_zap_page() function; then we can: - traverse the hlist safely - easily gather the remote tlb flushes that occur while pages are zapped These features are used in the later patches. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 52 ++++++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/mmutrace.h | 2 +- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 881ad918455..9b849a70742 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -916,6 +916,7 @@ static int is_empty_shadow_page(u64 *spt) static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); if (!sp->role.direct) @@ -1200,6 +1201,10 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); #define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ hlist_for_each_entry_safe(sp, pos, n, \ @@ -1530,7 +1535,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent) + struct kvm_mmu_page *parent, + struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; @@ -1544,7 +1550,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_zap_page(kvm, sp); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } @@ -1554,16 +1560,16 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { int ret; - trace_kvm_mmu_zap_page(sp); + trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp); + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); - kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) @@ -1571,17 +1577,45 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->root_count) { /* Count self */ ret++; - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); + list_move(&sp->link, invalid_list); } else { - sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } + + sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + +
do { + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + return ret; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc..3aab0f0930e 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp) -- cgit v1.2.3 From 103ad25a86a6ec5418b3dca6a0d2bf2ba01a8318 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:54:38 +0800 Subject: KVM: MMU: don't get free page number in the loop In a later patch, we will modify the way sps are zapped, like below: kvm_mmu_prepare_zap_page A kvm_mmu_prepare_zap_page B kvm_mmu_prepare_zap_page C .... kvm_mmu_commit_zap_page [ zapping multiple sps only needs one call to kvm_mmu_commit_zap_page ] In the __kvm_mmu_free_some_pages() function, the free page count is read from 'vcpu->kvm->arch.n_free_mmu_pages' inside the loop; this hinders applying kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page(), since kvm_mmu_prepare_zap_page() does not free the sp. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b849a70742..1aad8e713f7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2863,13 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + int free_pages; + + free_pages = vcpu->kvm->arch.n_free_mmu_pages; + while (free_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } } -- cgit v1.2.3 From d98ba053656c033180781007241f2c9d54606d56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:55:29 +0800 Subject: KVM: MMU: gather remote tlb flush which occurs during page zapped Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() instead of kvm_mmu_zap_page() reduces remote tlb flush IPIs. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1aad8e713f7..44548e34697 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,6 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -1218,10 +1217,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, (sp)->role.invalid) {} else static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - bool clear_unsync) + struct list_head *invalid_list, bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1232,7 +1231,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1244,17 +1243,22 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + LIST_HEAD(invalid_list); int ret; - ret = __kvm_sync_page(vcpu, sp, false); + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); if (!ret) mmu_convert_notrap(sp); + else + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + return ret; } -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, true); + return __kvm_sync_page(vcpu, sp, invalid_list, true); } /* @gfn should be write-protected at the call site */ @@ -1262,6 +1266,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); bool flush = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { @@ -1271,13 +1276,14 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_zap_page(vcpu->kvm, s); + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } kvm_unlink_unsync_page(vcpu->kvm, s); flush = true; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) kvm_mmu_flush_tlb(vcpu); } @@ -1348,6 +1354,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1360,9 +1367,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp); + kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1606,16 +1614,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); - return ret; -} - /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1623,6 +1621,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; + LIST_HEAD(invalid_list); used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 
used_pages = max(0, used_pages); @@ -1640,8 +1639,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } @@ -1656,6 +1657,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); @@ -1665,9 +1667,10 @@ restart: pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; } @@ -1675,14 +1678,16 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *nn; + LIST_HEAD(invalid_list); restart: for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -2123,6 +2128,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -2132,8 +2138,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2146,10 +2154,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2715,6 +2725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2801,7 +2812,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_zap_page(vcpu->kvm, sp)) + if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list)) goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2836,6 +2848,7 @@ restart: ++spte; } } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2864,6 +2877,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { int free_pages; + LIST_HEAD(invalid_list); free_pages = vcpu->kvm->arch.n_free_mmu_pages; while (free_pages < KVM_REFILL_PAGES && @@ -2872,9 +2886,11 @@ void 
__kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3009,25 +3025,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page); + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -3040,6 +3059,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages, idx, freed_pages; + LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -3047,12 +3067,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.3 From f41d335a02d5132c14ec0459d3b2790eeb16fb11 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:11 +0800 Subject: KVM: MMU: traverse sp hlist safely Now we can safely traverse the sp hlist. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44548e34697..3b75689eda9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1205,13 +1205,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1265,11
+1265,11 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (!s->unsync) continue; @@ -1387,7 +1387,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node, *tmp; + struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1401,7 +1401,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { if (!need_sync && sp->unsync) need_sync = true; @@ -1656,19 +1656,18 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; @@ -1677,15 +1676,13 @@ restart: static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *nn; + struct hlist_node *node; LIST_HEAD(invalid_list); -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -1830,9 +1827,9 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1844,10 +1841,10 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2724,7 +2721,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; @@ -2794,8 +2791,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } -restart: - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { + 
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; @@ -2812,9 +2808,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } -- cgit v1.2.3 From 0671a8e75d8aeb33e15c5152147abb0d2fa0c1e6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:59 +0800 Subject: KVM: MMU: reduce remote tlb flush in kvm_mmu_pte_write() Collect the remote tlb flushes in the kvm_mmu_pte_write() path. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b75689eda9..b285449e82b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2666,11 +2666,15 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) { - if (need_remote_flush(old, new)) + if (zap_page) + return; + + if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); - else + else if (local_flush) kvm_mmu_flush_tlb(vcpu); } @@ -2735,6 +2739,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; int r; int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2808,7 +2815,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2833,16 +2840,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != sp->role.quadrant) continue; } + local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); if (gentry) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; ++spte; } } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3 From b9d762fa79f541ab480cdb733b46fdb0b4471c2d Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:32:29 +0800 Subject: KVM: VMX: Add all-context INVVPID type support Add all-context INVVPID type support.
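Condensed, the dispatch this patch and the following one establish looks like the sketch below (simplified from the diffs that follow, not the literal kernel code):

/*
 * Prefer the cheaper single-context invalidation, which flushes only
 * this vcpu's VPID; fall back to all-context INVVPID, which flushes
 * every VPID, when the CPU only advertises the global type.
 */
static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_all(vmx);	/* single-context INVVPID */
	else
		vpid_sync_vcpu_global();	/* all-context INVVPID */
}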
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b4e28400c9f..96a5886d384 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -377,6 +377,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ +#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94526536188..622d83b0caf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -365,6 +365,11 @@ static inline bool cpu_has_vmx_invvpid_single(void) return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -513,6 +518,20 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_all(vmx); + else + vpid_sync_vcpu_global(); +} + static inline void ept_sync_global(void) { if (cpu_has_vmx_invept_global()) @@ -1800,7 +1819,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vpid_sync_vcpu_all(to_vmx(vcpu)); + vpid_sync_context(to_vmx(vcpu)); if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -2756,7 +2775,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - vpid_sync_vcpu_all(vmx); + vpid_sync_context(vmx); ret = 0; -- cgit v1.2.3 From 1760dd4939a62591e492971858fac8cce1e4539e Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:33:27 +0800 Subject: KVM: VMX: rename vpid_sync_vcpu_all() to vpid_sync_vcpu_single() The name "vpid_sync_vcpu_all" isn't appropriate, since it affects just a single vpid; rename it to vpid_sync_vcpu_single().
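For reference, the renamed helper (visible in context in the diff above) only invalidates the one VPID owned by this vcpu, which is what made the old name misleading; a sketch:

static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
	/* VPID 0 belongs to the host and is never flushed this way. */
	if (vmx->vpid == 0)
		return;
	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}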
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 622d83b0caf..7d7361750bf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -509,7 +509,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); } -static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) { if (vmx->vpid == 0) return; @@ -527,7 +527,7 @@ static inline void vpid_sync_vcpu_global(void) static inline void vpid_sync_context(struct vcpu_vmx *vmx) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_all(vmx); + vpid_sync_vcpu_single(vmx); else vpid_sync_vcpu_global(); } -- cgit v1.2.3 From 4b9d3a04519fb508ad3b7ce8a7962929b2614185 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 10:15:51 +0800 Subject: KVM: VMX: fix rcu usage warning in init_rmode() fix: [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/kvm_host.h:258 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by qemu-system-x86/3796: #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1a/0x66 [kvm] stack backtrace: Pid: 3796, comm: qemu-system-x86 Not tainted 2.6.34 #25 Call Trace: [] lockdep_rcu_dereference+0x9d/0xa5 [] gfn_to_memslot_unaliased+0x65/0xa0 [kvm] [] gfn_to_hva+0x22/0x4c [kvm] [] kvm_write_guest_page+0x2a/0x7f [kvm] [] kvm_clear_guest_page+0x1a/0x1c [kvm] [] init_rmode+0x3b/0x180 [kvm_intel] [] vmx_set_cr0+0x350/0x4d3 [kvm_intel] [] kvm_arch_vcpu_ioctl_set_sregs+0x122/0x31a [kvm] [] kvm_vcpu_ioctl+0x578/0xa3d [kvm] [] ? cpu_clock+0x2d/0x40 [] ? fget_light+0x244/0x28e [] ? trace_hardirqs_off_caller+0x1f/0x10e [] vfs_ioctl+0x32/0xa6 [] do_vfs_ioctl+0x47f/0x4b8 [] ? sub_preempt_count+0xa3/0xb7 [] ? fget_light+0x266/0x28e [] ? 
fget_light+0x111/0x28e [] sys_ioctl+0x47/0x6a [] system_call_fastpath+0x16/0x1b Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d7361750bf..01b054c9813 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,21 +2659,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static int init_rmode(struct kvm *kvm) { + int idx, ret = 0; + + idx = srcu_read_lock(&kvm->srcu); if (!init_rmode_tss(kvm)) - return 0; + goto exit; if (!init_rmode_identity_map(kvm)) - return 0; - return 1; + goto exit; + + ret = 1; +exit: + srcu_read_unlock(&kvm->srcu, idx); + return ret; } static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret, idx; + int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2783,7 +2789,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } -- cgit v1.2.3 From 4f78fd08e91c52f097d64a42d903b76fe52a3a0f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:05 +0800 Subject: KVM: MMU: remove unnecessary remote tlb flush This remote tlb flush is not necessary, since we have already synced while the sp was zapped. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b285449e82b..098a0b8616b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3040,8 +3040,6 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); } static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, -- cgit v1.2.3 From 5304efde6ae27deeeae79b97af709d4ceecc336e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:57 +0800 Subject: KVM: MMU: use wrapper function to flush local tlb Use the kvm_mmu_flush_tlb() function instead of calling kvm_x86_ops->tlb_flush(vcpu) directly.
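The payoff comes in the next patch but one, which changes the flush policy in exactly one place. Sketched (the "after" body is what that patch installs):

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	/* Before: flush synchronously on every call. */
	/* kvm_x86_ops->tlb_flush(vcpu); */
	/* After: just record the request; the flush happens (at most
	 * once) on the next guest entry, so repeated requests and
	 * remote IPIs collapse into a single flush. */
	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
}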
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 098a0b8616b..e087f855461 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1984,7 +1984,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_x86_ops->tlb_flush(vcpu); + kvm_mmu_flush_tlb(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); -- cgit v1.2.3 From 3b5d13218667b3ca52efa52cec1d322163bf5465 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:07:01 +0800 Subject: KVM: MMU: delay local tlb flush Delay the local tlb flush until we enter guest mode; this reduces the vpid flush frequency and the number of remote tlb flush IPIs (if the KVM_REQ_TLB_FLUSH bit is already set, no IPI is sent). Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e087f855461..4706a936e36 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2364,7 +2364,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); } static void paging_new_cr3(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From a24e809902339458416900869abdcc51a44bfd48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 10 Jun 2010 13:10:55 +0200 Subject: KVM: Fix unused but set warnings No real bugs in this one. Signed-off-by: Andi Kleen Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1f7f5dd8306..9308be2d5c0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -444,6 +444,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); + (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); -- cgit v1.2.3 From f495c6e5e8fdc972162241df5bdff5bcebb4dc33 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:21:29 +0300 Subject: KVM: VMX: Fix incorrect rcu deref in rmode_tss_base() Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 01b054c9813..26ba61d6af8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1688,7 +1688,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = kvm->memslots->memslots[0].base_gfn + + base_gfn = slots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } -- cgit v1.2.3 From 2acf923e38fb6a4ce0c57115decbb38d334902ac Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 10 Jun 2010 11:27:12 +0800 Subject: KVM: VMX: Enable XSAVE/XRSTOR for guest This patch enables the guest to use the XSAVE/XRSTOR instructions. We assume that host_xcr0 uses all possible bits that the OS supports. And we load xcr0 the same way we handle the fpu - do it as late as we can.
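A condensed sketch of the lazy switching this implies, pieced together from the x86.c hunks below (simplified, not the full patch):

static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
	/* Switch to the guest's XCR0 only if it enabled OSXSAVE. */
	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
	    !vcpu->guest_xcr0_loaded) {
		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
		vcpu->guest_xcr0_loaded = 1;
	}
}

static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
	/* Restore the host value, skipping the costly xsetbv when
	 * guest and host happen to agree. */
	if (vcpu->guest_xcr0_loaded) {
		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
		vcpu->guest_xcr0_loaded = 0;
	}
}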
Signed-off-by: Dexuan Cui Signed-off-by: Sheng Yang Reviewed-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/kvm_cache_regs.h | 6 ++ arch/x86/kvm/vmx.c | 13 ++++ arch/x86/kvm/x86.c | 130 +++++++++++++++++++++++++++++++++++++--- 5 files changed, 145 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0cd0f2923af..91631b8b209 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -302,6 +302,7 @@ struct kvm_vcpu_arch { } update_pte; struct fpu guest_fpu; + u64 xcr0; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 96a5886d384..9f0cbd987d5 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -267,6 +267,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 /* * Interruption-information format diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index d2a98f8f9af..6491ac8e755 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) return kvm_read_cr4_bits(vcpu, ~0UL); } +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 26ba61d6af8..864a1b6d155 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "trace.h" @@ -3390,6 +3392,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; @@ -3668,6 +3680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08c0052e33..b5e644701cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -65,6 +65,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define 
CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -150,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -474,6 +482,61 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + u64 xcr0; + + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } +} + int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); @@ -482,6 +545,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & CR4_RESERVED_BITS) return 1; + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -498,6 +564,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + return 0; } @@ -666,11 +735,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
@@ -1814,6 +1878,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -1837,6 +1902,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); return 0; out: @@ -1917,7 +1983,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | @@ -1932,7 +1998,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; @@ -1990,6 +2056,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; @@ -4125,6 +4205,9 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return 0; out: @@ -4523,6 +4606,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -4568,6 +4670,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); atomic_set(&vcpu->guest_mode, 1); smp_wmb(); @@ -5124,6 +5227,11 @@ int fx_init(struct kvm_vcpu *vcpu) fpu_finit(&vcpu->arch.guest_fpu); + /* + * Ensure guest xcr0 is valid for loading + */ + vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.cr0 |= X86_CR0_ET; return 0; @@ -5140,6 +5248,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) if (vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. 
+ */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); fpu_restore_checking(&vcpu->arch.guest_fpu); @@ -5148,6 +5262,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; -- cgit v1.2.3 From 49a9b07edcf4aff159c1f3d3a27e58cf38bc27cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:14 +0300 Subject: KVM: Fix mov cr0 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 13 +++++++++++-- arch/x86/kvm/x86.c | 12 +++--------- 4 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 91631b8b209..b2370845021 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -597,7 +597,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2ae0c392329..6d1616d47c5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -807,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm) * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); save->cr4 = X86_CR4_PAE; /* rdx = ?? 
*/ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 864a1b6d155..1baf4b2d98e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3157,11 +3157,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; + int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -3172,8 +3181,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr0(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 3: kvm_set_cr3(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e644701cc..05e9b5dde64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -425,7 +425,7 @@ out: return changed; } -static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | @@ -468,17 +468,11 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_mmu_reset_context(vcpu); return 0; } - -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (__kvm_set_cr0(vcpu, cr0)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -3732,7 +3726,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) switch (cr) { case 0: - res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; -- cgit v1.2.3 From a83b29c6ad6d6497e569edbc29e556a384cebddd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:15 +0300 Subject: KVM: Fix mov cr4 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
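All three mov-to-cr fixes rely on the complete_insn_gp() helper introduced in the cr0 patch above: the setter now returns non-zero on failure, and the exit handler performs exactly one of inject-or-skip. Sketched usage:

/* In handle_cr(): RIP must still point at the faulting mov when a #GP
 * is injected, and must be advanced only when the write succeeded. */
err = kvm_set_cr4(vcpu, val);
complete_insn_gp(vcpu, err);	/* inject #GP or skip, never both */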
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2370845021..ea8c319cdff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -599,7 +599,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1baf4b2d98e..f64d65dc38c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3189,8 +3189,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr4(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05e9b5dde64..ed3af15d440 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -531,7 +531,7 @@ static void update_cpuid(struct kvm_vcpu *vcpu) } } -int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; @@ -563,12 +563,6 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } - -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (__kvm_set_cr4(vcpu, cr4)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr4); static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) @@ -3735,7 +3729,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) res = __kvm_set_cr3(vcpu, val); break; case 4: - res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: res = __kvm_set_cr8(vcpu, val & 0xfUL); -- cgit v1.2.3 From 2390218b6aa2eb3784b0a82fa811c19097dc793a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:16 +0300 Subject: KVM: Fix mov cr3 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
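Returning an error code from the setters also lets the instruction emulator share them directly; condensed from the x86.c hunks of these three patches (a sketch, not the complete function):

static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
{
	int res = 0;

	switch (cr) {
	case 0:
		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
		break;
	case 3:
		res = kvm_set_cr3(vcpu, val);
		break;
	case 4:
		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
		break;
	}
	/* A non-zero result makes the emulator inject #GP itself. */
	return res;
}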
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ea8c319cdff..c2813d658f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,7 +598,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4706a936e36..aa98fca03ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3203,7 +3203,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6d1616d47c5..f7a6fdcf8ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1963,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2086,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; } else - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); /* Guest paging mode is active - reset mmu */ kvm_mmu_reset_context(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f64d65dc38c..345a3547051 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3185,8 +3185,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) complete_insn_gp(vcpu, err); return 1; case 3: - kvm_set_cr3(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr3(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 4: err = kvm_set_cr4(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed3af15d440..795999e1ac1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -565,7 +565,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } EXPORT_SYMBOL_GPL(kvm_set_cr4); -static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); @@ -604,12 +604,6 @@ static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.mmu.new_cr3(vcpu); return 0; } - -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (__kvm_set_cr3(vcpu, cr3)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr3); int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) @@ -3726,7 +3720,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu 
*vcpu) vcpu->arch.cr2 = val; break; case 3: - res = __kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); -- cgit v1.2.3 From 2d5b5a665508c60577c1088e0405850a965b6795 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Sun, 13 Jun 2010 17:29:39 +0800 Subject: KVM: x86: XSAVE/XRSTOR live migration support This patch enables save/restore of the xsave state. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm.h | 22 +++++++ arch/x86/include/asm/xsave.h | 7 ++- arch/x86/kvm/x86.c | 139 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ff90055c7f0..4d8dcbdfc12 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -22,6 +22,8 @@ #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS #define __KVM_HAVE_DEBUGREGS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -299,4 +301,24 @@ struct kvm_debugregs { __u64 reserved[9]; }; +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 29ee4e4c64c..32c36668fa7 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,8 +13,11 @@ #define FXSAVE_SIZE 512 -#define XSTATE_YMM_SIZE 256 -#define XSTATE_YMM_OFFSET (512 + 64) +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* * These are the features that the OS can handle currently.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 795999e1ac1..0c8dc9614e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1680,6 +1680,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1703,6 +1704,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; default: r = 0; break; @@ -2355,6 +2359,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + sizeof(struct xsave_struct)); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, sizeof(struct xsave_struct)); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2556,6 +2631,70 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } + case KVM_GET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + + r = -EFAULT; + if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + r = -EFAULT; + if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + break; + } + case KVM_GET_XCRS: { + struct kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + + r = -EFAULT; + if (copy_to_user(argp, xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + struct 
kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + r = -EFAULT; + if (copy_from_user(xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + break; + } default: r = -EINVAL; } -- cgit v1.2.3 From ac3cd03cca91d481b41e8236aaa41a7f9fafa62f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:28:14 +0800 Subject: KVM: MMU: rename 'page' and 'shadow_page' to 'sp' Rename 'page' and 'shadow_page' to 'sp' to better fit the context Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9308be2d5c0..e461f2393d8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ err: return 0; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { pt_element_t gpte; @@ -264,7 +264,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { if (!is_present_gpte(gpte)) { - if (page->unsync) + if (sp->unsync) new_spte = shadow_trap_nonpresent_pte; else new_spte = shadow_notrap_nonpresent_pte; @@ -273,7 +273,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; pfn = vcpu->arch.update_pte.pfn; @@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * we call mmu_set_spte() with reset_host_protection = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -300,7 +300,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *shadow_page; + struct kvm_mmu_page *sp; u64 spte, *sptep = NULL; int direct; gfn_t table_gfn; @@ -341,9 +341,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, access &= ~ACC_WRITE_MASK; /* * It is a large guest pages backed by small host pages, - * So we set @direct(@shadow_page->role.direct)=1, and - * set @table_gfn(@shadow_page->gfn)=the base page frame - * for linear translations. + * So we set @direct(@sp->role.direct)=1, and set + * @table_gfn(@sp->gfn)=the base page frame for linear + * translations. 
*/ table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; @@ -351,21 +351,21 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 0; table_gfn = gw->table_gfn[level - 2]; } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } } - spte = __pa(shadow_page->spt) + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *sptep = spte; -- cgit v1.2.3 From cb83cad2e7e1cdedb2abb9cef2ac076defa679d4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:29:42 +0800 Subject: KVM: MMU: cleanup for dirty page judgment Use a wrapper function to clean up the page-dirty judgment. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e461f2393d8..efba353369e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -287,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -319,7 +319,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, + is_dirty_gpte(gw->ptes[gw->level-1]), ptwrite, level, gw->gfn, pfn, false, true); break; -- cgit v1.2.3 From f918b443527e98476c8cc45683152106b9e4bedc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:30:36 +0800 Subject: KVM: MMU: avoid double write protection in sync page path The sync page is already write-protected in mmu_sync_children(); don't write-protect it again. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa98fca03ed..ff333572be7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,6 +1216,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else +/* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list, bool clear_unsync) { @@ -1224,11 +1225,8 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - if (clear_unsync) { - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); -- cgit v1.2.3 From be71e061d15c0aad4f8c2606f76c57b8a19792fd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:31:38 +0800 Subject: KVM: MMU: don't mark pte
notrap if it's just sync transient If the sp is just being synced transiently, don't mark its ptes notrap. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 11 ++++------- arch/x86/kvm/paging_tmpl.h | 5 +++-- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2813d658f3..2ec2e27a403 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -241,7 +241,7 @@ struct kvm_mmu { void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); + struct kvm_mmu_page *sp, bool clear_unsync); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff333572be7..d1e09f3c561 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1103,7 +1103,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool clear_unsync) { return 1; } @@ -1228,7 +1228,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1237,7 +1237,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 0; } -static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1245,9 +1244,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (!ret) - mmu_convert_notrap(sp); - else + if (ret) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); return ret; @@ -1273,7 +1270,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index efba353369e..863920f649f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -578,7 +578,8 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * can't change unless all sptes pointing to it are nuked first. * - Alias changes zap the entire shadow cache.
*/ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { int i, offset, nr_present; bool reset_host_protection; @@ -615,7 +616,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_gpte(gpte)) + if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; -- cgit v1.2.3 From ebdea638df04ae6293a9a5414d98ad843c69e82f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:32:34 +0800 Subject: KVM: MMU: cleanup for __mmu_unsync_walk() Decrement sp->unsync_children after clearing a bit in unsync_child_bitmap. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1e09f3c561..41e801b5306 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1160,9 +1160,11 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) + if (!ret) { __clear_bit(i, sp->unsync_child_bitmap); - else if (ret > 0) + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); + } else if (ret > 0) nr_unsync_leaf += ret; else return ret; @@ -1176,8 +1178,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) - sp->unsync_children = 0; return nr_unsync_leaf; } -- cgit v1.2.3 From 7a8f1a74e4193d21e55b35928197486f2c047efb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:34:04 +0800 Subject: KVM: MMU: clear unsync_child_bitmap completely In the current code, some pages' unsync_child_bitmap is not cleared completely in mmu_sync_children(); for example, if two PDPEs share one page-directory table, one of the PDPEs' unsync_child_bitmap bits is never cleared, as the sketch below illustrates.
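A small, compilable user-space model of that bookkeeping (purely illustrative: the struct, sizes, and helper names below are invented for the sketch and are not the kernel's). Two parent entries point at the same child table; walking in through only one of them clears only that parent's bitmap bit, leaving the other parent's state stale:

        #include <stdio.h>

        #define ENTRIES 512
        #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

        struct parent_model {
                unsigned long unsync_child_bitmap[ENTRIES / (8 * sizeof(unsigned long))];
                unsigned int unsync_children;
        };

        static void set_unsync(struct parent_model *p, int idx)
        {
                p->unsync_child_bitmap[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
                p->unsync_children++;
        }

        /* Mirrors the previous patch's invariant: clear the bit and
         * decrement the counter together. */
        static void sync_child(struct parent_model *p, int idx)
        {
                p->unsync_child_bitmap[idx / BITS_PER_LONG] &= ~(1UL << (idx % BITS_PER_LONG));
                p->unsync_children--;
        }

        int main(void)
        {
                struct parent_model pdpte0 = { { 0 }, 0 }, pdpte1 = { { 0 }, 0 };

                /* Two PDPEs share one page-directory table at entry 7. */
                set_unsync(&pdpte0, 7);
                set_unsync(&pdpte1, 7);

                /* The sync walk happens to come in via pdpte0 only... */
                sync_child(&pdpte0, 7);

                /* ...so pdpte1 still claims an unsync child even though
                 * the shared table itself has been synced. */
                printf("pdpte0: %u unsync, pdpte1: %u unsync\n",
                       pdpte0.unsync_children, pdpte1.unsync_children);
                return 0;
        }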
Currently this does no harm beyond a little extra work, but it is preparatory work for a later patch. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 41e801b5306..ab12be4eb10 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1149,33 +1149,38 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) { - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); - } else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; - if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } - } + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); } -- cgit v1.2.3 From 1047df1fb682a41eb9885d6b3f2d04d6c8fd3756 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:35:15 +0800 Subject: KVM: MMU: don't walk every parent page while marking unsync When we mark a parent's unsync_child_bitmap, there is no need to walk that parent's own parents if it is already unsynced; this avoids some unnecessary work. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 61 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ab12be4eb10..8c2f580956d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,7 +175,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -1024,7 +1024,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } - static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; @@ -1034,63 +1033,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + fn(parent_sp, sp->parent_pte); return; } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i <
NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) break; - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); } } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - index = spte - sp->spt; - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) - sp->unsync_children++; - WARN_ON(!sp->unsync_children); + mmu_parent_walk(sp, mark_unsync); } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; + unsigned int index; - if (!sp->parent_pte) + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; - - if (!sp->multimapped) { - kvm_mmu_update_unsync_bitmap(sp->parent_pte); + if (sp->unsync_children++) return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); - } -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ - kvm_mmu_update_parents_unsync(sp); - return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - mmu_parent_walk(sp, unsync_walk_fn); - kvm_mmu_update_parents_unsync(sp); + kvm_mmu_mark_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From bd371396b38ffc4bd6444b0203f33b99d18cedd0 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Mon, 14 Jun 2010 11:42:15 -1000 Subject: KVM: x86: fix -DDEBUG oops Fix a slight error with an assertion in the local APIC code. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d8258a0060f..024f6d1c299 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -329,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); - ASSERT(!target); + ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == 0) -- cgit v1.2.3 From c37eda138473f8c843f2b4aa8da252fdfdaaafa3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 15 Jun 2010 09:03:33 +0800 Subject: KVM: x86 emulator: fix pusha instruction emulation The pusha emulation only wrote back the last register (EDI); the write-back needed for the other registers was skipped. This patch fixes it.
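The shape of the fix, as a compilable stand-alone model (the emulator's real staging/commit machinery is more involved; push() and writeback() here are simplified stand-ins):

        #include <stdio.h>

        enum { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NREGS };

        static unsigned long regs[NREGS];
        static unsigned long stack[64];
        static int sp;

        /* Stand-in for emulate_push(): stages the pushed value. */
        static void push(unsigned long val)
        {
                stack[sp++] = val;
        }

        /* Stand-in for writeback(): commits the register side effects of
         * the current micro-op. Before the fix this ran only once, after
         * the loop, so only the final iteration's effects landed. */
        static int writeback(void)
        {
                regs[RSP] -= sizeof(unsigned long);
                return 0;
        }

        int main(void)
        {
                unsigned long old_rsp;
                int reg;

                regs[RSP] = old_rsp = 0x8000;
                for (reg = RAX; reg <= RDI; reg++) {
                        /* pusha pushes the pre-instruction SP value. */
                        push(reg == RSP ? old_rsp : regs[reg]);
                        if (writeback() != 0)   /* the fix: commit per iteration */
                                return 1;
                }
                printf("sp after pusha: %#lx (8 slots committed)\n", regs[RSP]);
                return 0;
        }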
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 133 +++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4c2dcd1032..c990db0a3a0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1553,6 +1553,64 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return X86EMUL_CONTINUE; +} + static inline void emulate_push(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1651,11 +1709,12 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt, +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { @@ -1663,8 +1722,18 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt, (c->src.val = old_esp) : (c->src.val = c->regs[reg]); emulate_push(ctxt, ops); + + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + ++reg; } + + /* Disable writeback. */ + c->dst.type = OP_NONE; + + return rc; } static int emulate_popa(struct x86_emulate_ctxt *ctxt, @@ -1817,64 +1886,6 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - u32 err; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return X86EMUL_CONTINUE; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2689,7 +2700,9 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt, ops); + rc = emulate_pusha(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); -- cgit v1.2.3 From 33572ac0ad5ba5016da72e6654e607726568f9c0 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:11 -0400 Subject: KVM: x86: Introduce a workqueue to deliver PIT timer interrupts We really want to "kvm_set_irq" during the hrtimer callback, but that is risky because that is during interrupt context. Instead, offload the work to a workqueue, which is a bit safer and should provide most of the same functionality. Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 141 +++++++++++++++++++++++++++++++-------------------- arch/x86/kvm/i8254.h | 4 +- arch/x86/kvm/irq.c | 1 - 3 files changed, 88 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 188d82762c1..467cc47fb73 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,6 +34,7 @@ #include #include +#include #include "irq.h" #include "i8254.h" @@ -244,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - raw_spin_lock(&ps->inject_lock); - if (atomic_dec_return(&ps->pit_timer.pending) < 0) + int value; + + spin_lock(&ps->inject_lock); + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. 
Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; - raw_spin_unlock(&ps->inject_lock); + spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -264,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit) { - pr_debug("execute del timer!\n"); - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -281,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { struct kvm_timer *pt = &ps->pit_timer; @@ -292,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = kvm_timer_fn; + pt->timer.function = pit_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -347,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) } break; default: - destroy_pit_timer(&ps->pit_timer); + destroy_pit_timer(kvm->arch.vpit); } } @@ -626,7 +692,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - raw_spin_lock_init(&pit->pit_state.inject_lock); + spin_lock_init(&pit->pit_state.inject_lock); + + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -685,54 +758,10 @@ void 
kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct kvm *kvm = vcpu->kvm; - struct kvm_kpit_state *ps; - - if (pit) { - int inject = 0; - ps = &pit->pit_state; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - raw_spin_lock(&ps->inject_lock); - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - raw_spin_unlock(&ps->inject_lock); - if (inject) - __inject_pit_timer_intr(kvm); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c..46d08ca0b48 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - raw_spinlock_t inject_lock; + spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -40,6 +40,8 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 0f4e488331c..2095a049835 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); - kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); -- cgit v1.2.3 From e7dca5c0eba63e4ba8e3586c4b37863fd7fadb5a Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:12 -0400 Subject: KVM: x86: Allow any LAPIC to accept PIC interrupts If the guest wants to accept timer interrupts on a CPU other than the BSP, we need to remove this gate. 
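With the BSP check gone, the predicate reduces to the virtual-wire rules alone. A compilable sketch of the resulting logic (the LVT0 bit positions below follow the x86 LAPIC layout, but the constants are spelled out here rather than taken from kernel headers):

        #include <stdio.h>

        #define APIC_LVT_MASKED  (1u << 16) /* LVT0 bit 16: entry masked */
        #define APIC_MODE_EXTINT 0x7        /* delivery-mode value for ExtINT */

        static unsigned int delivery_mode(unsigned int lvt0)
        {
                return (lvt0 >> 8) & 0x7;   /* LVT delivery mode, bits 10:8 */
        }

        /* Mirrors kvm_apic_accept_pic_intr() after this patch: ANY vcpu
         * whose LAPIC is hw-disabled, or whose LVT0 is unmasked ExtINT,
         * may take a PIC interrupt, not just the BSP. */
        static int accepts_pic_intr(int apic_hw_enabled, unsigned int lvt0)
        {
                if (!apic_hw_enabled)
                        return 1;
                if (!(lvt0 & APIC_LVT_MASKED) &&
                    delivery_mode(lvt0) == APIC_MODE_EXTINT)
                        return 1;
                return 0;
        }

        int main(void)
        {
                printf("disabled lapic:        %d\n", accepts_pic_intr(0, 0));
                printf("enabled, LVT0=ExtINT:  %d\n", accepts_pic_intr(1, 0x700));
                printf("enabled, LVT0 masked:  %d\n",
                       accepts_pic_intr(1, 0x700 | APIC_LVT_MASKED));
                return 0;
        }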
Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 024f6d1c299..49573c78c24 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1107,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (kvm_vcpu_is_bsp(vcpu)) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; return r; } -- cgit v1.2.3 From 7d5993d63f2bac75b89e171a7098044ec4bc701f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 17 Jun 2010 17:33:55 +0800 Subject: KVM: x86 emulator: fix group3 instruction decoding A Group 3 instruction with a ModRM reg field of 001 is defined as a test instruction in the AMD architecture, and emulate_grp3() is already able to emulate it, so fix the decoding. static inline int emulate_grp3(...) { ... switch (c->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); ... } Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c990db0a3a0..abb8cec420a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -336,11 +336,11 @@ static u32 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM, 0, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = -- cgit v1.2.3 From a1a005f36e0defea7c5490772c318c6af2261d31 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:47:34 +0300 Subject: KVM: Fix xsave and xcr save/restore memory leak We allocate temporary kernel buffers for these structures, but never free them.
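The bug in miniature, as a compilable user-space model (the names and sizes are invented; free(NULL) being a no-op is what makes the unconditional cleanup at the end safe):

        #include <stdlib.h>

        long vcpu_ioctl_model(int op)
        {
                void *xsave = NULL, *xcrs = NULL;
                long r = -1;

                switch (op) {
                case 0: /* GET_XSAVE-like path */
                        xsave = calloc(1, 4096);
                        if (!xsave)
                                break;  /* early exit must not leak */
                        /* ... fill the buffer and copy it out ... */
                        r = 0;
                        break;
                case 1: /* GET_XCRS-like path */
                        xcrs = calloc(1, 128);
                        if (!xcrs)
                                break;
                        r = 0;
                        break;
                }

                /* The fix: release both on every path out, just as the
                 * patch adds kfree(xsave)/kfree(xcrs) beside kfree(lapic). */
                free(xsave);
                free(xcrs);
                return r;
        }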
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c8dc9614e7..d918cb15e5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2437,6 +2437,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; struct kvm_lapic_state *lapic = NULL; + struct kvm_xsave *xsave = NULL; + struct kvm_xcrs *xcrs = NULL; switch (ioctl) { case KVM_GET_LAPIC: { @@ -2632,8 +2634,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2648,8 +2648,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2663,8 +2661,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2680,8 +2676,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2700,6 +2694,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: kfree(lapic); + kfree(xsave); + kfree(xcrs); return r; } -- cgit v1.2.3 From d1ac91d8a2f00dc6a3954f7e8971339b0893edc4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:54:43 +0300 Subject: KVM: Consolidate load/save temporary buffer allocation and freeing Instead of three temporary variables and three free calls, have one temporary variable (with four names) and one free call. 
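A sketch of the consolidated pattern (the struct contents and sizes are placeholders). Whichever member a case assigns, u.buffer aliases the same pointer, so a single free covers every path:

        #include <stdlib.h>

        struct lapic_state { char data[1024]; };
        struct xsave_state { char data[4096]; };
        struct xcrs_state  { char data[128];  };

        long vcpu_ioctl_model(int op)
        {
                union {
                        struct lapic_state *lapic;
                        struct xsave_state *xsave;
                        struct xcrs_state  *xcrs;
                        void *buffer;   /* common view for the single free */
                } u;
                long r = -1;

                u.buffer = NULL;
                switch (op) {
                case 0:
                        u.lapic = calloc(1, sizeof(*u.lapic));
                        r = u.lapic ? 0 : -1;
                        break;
                case 1:
                        u.xsave = calloc(1, sizeof(*u.xsave));
                        r = u.xsave ? 0 : -1;
                        break;
                }

                free(u.buffer); /* one temporary variable, one free call */
                return r;
        }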
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d918cb15e5b..8e60b6c9c0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2436,25 +2436,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; - struct kvm_lapic_state *lapic = NULL; - struct kvm_xsave *xsave = NULL; - struct kvm_xcrs *xcrs = NULL; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2463,14 +2467,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; r = -EFAULT; - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; r = 0; @@ -2634,68 +2638,66 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); r = -EFAULT; - if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) break; r = 0; break; } case KVM_SET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; r = -EFAULT; - if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) break; - r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } case KVM_GET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); r = -EFAULT; - if (copy_to_user(argp, xcrs, + if (copy_to_user(argp, u.xcrs, sizeof(struct kvm_xcrs))) break; r = 0; break; } case KVM_SET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; r = -EFAULT; - if (copy_from_user(xcrs, argp, + if (copy_from_user(u.xcrs, argp, sizeof(struct kvm_xcrs))) break; - r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + r = 
kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } default: r = -EINVAL; } out: - kfree(lapic); - kfree(xsave); - kfree(xcrs); + kfree(u.buffer); return r; } -- cgit v1.2.3 From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:44:20 +0300 Subject: KVM: Remove memory alias support As advertised in feature-removal-schedule.txt. Equivalent support is provided by overlapping memory regions. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 21 ------- arch/x86/kvm/mmu.c | 17 ++---- arch/x86/kvm/paging_tmpl.h | 3 +- arch/x86/kvm/x86.c | 125 ---------------------------------------- arch/x86/kvm/x86.h | 7 --- 5 files changed, 5 insertions(+), 168 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ec2e27a403..a57cdeacc4d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -69,8 +69,6 @@ #define IOPL_SHIFT 12 -#define KVM_ALIAS_SLOTS 4 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 @@ -362,24 +360,7 @@ struct kvm_vcpu_arch { u64 hv_vapic; }; -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL - unsigned long flags; -}; - -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION - -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; -}; - struct kvm_arch { - struct kvm_mem_aliases *aliases; - unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_alloc_mmu_pages; @@ -655,8 +636,6 @@ void kvm_disable_tdp(void); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); -struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); - static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8c2f580956d..c5501bc1010 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm, struct kvm_memory_slot *slot; int *largepage_idx; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; @@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. 
- * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; - gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); @@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; - gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->unsync) continue; - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 863920f649f..a21a86ef9e2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -576,7 +576,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - Alias changes zap the entire shadow cache. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) @@ -611,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + if (gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e60b6c9c0b..62596d373a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2740,115 +2740,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases, *old_aliases; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out; - - mutex_lock(&kvm->slots_lock); - - /* invalidate any gfn reference in case of deletion/shrinking */ - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kvm_mmu_zap_all(kvm); - kfree(old_aliases); - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out_unlock; - - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - - p = &aliases->aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - p->flags &= ~(KVM_ALIAS_INVALID); - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (aliases->aliases[n - 1].npages) - break; - aliases->naliases = n; - - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_aliases); - r = 0; - -out_unlock: - mutex_unlock(&kvm->slots_lock); -out: - return r; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; @@ -3056,7 +2947,6 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; - struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -3101,14 +2991,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: - r = -EFAULT; - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); - if (r) - goto out; - break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -5559,12 +5441,6 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!kvm->arch.aliases) { - kfree(kvm); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5622,7 +5498,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); cleanup_srcu_struct(&kvm->srcu); - kfree(kvm->arch.aliases); kfree(kvm); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285..b7a404722d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } -static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) -{ - return rcu_dereference_check(kvm->arch.aliases, - srcu_read_lock_held(&kvm->srcu) - || 
lockdep_is_held(&kvm->slots_lock)); -} - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From b74a07beed0e64bfba413dcb70dd6749c57f43dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:48:05 +0300 Subject: KVM: Remove kernel-allocated memory regions Equivalent (and better) functionality is provided by user-allocated memory regions. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62596d373a4..9be6e4e5e8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2967,22 +2967,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) -- cgit v1.2.3 From 073d46133ab0b42154f6b8429f4f66dbe2760bda Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:34:34 +0300 Subject: KVM: i8259: reduce excessive abstraction for pic_irq_request() Part of the i8259 code pretends it isn't part of kvm, but we know better. Reduce excessive abstraction, eliminating callbacks and void pointers. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++---------- arch/x86/kvm/irq.h | 4 ---- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 2c73f449314..caf6e1b3d95 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,8 @@ #include #include "trace.h" +static void pic_irq_request(struct kvm *kvm, int level); + static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { @@ -175,9 +177,9 @@ static void pic_update_irq(struct kvm_pic *s) } irq = pic_get_irq(&s->pics[0]); if (irq >= 0) - s->irq_request(s->irq_request_opaque, 1); + pic_irq_request(s->kvm, 1); else - s->irq_request(s->irq_request_opaque, 0); + pic_irq_request(s->kvm, 0); } void kvm_pic_update_irq(struct kvm_pic *s) @@ -262,8 +264,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; - struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; u8 irr = s->irr, isr = s->imr; s->last_irr = 0; @@ -302,8 +303,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) /* * deassert a pending interrupt */ - s->pics_state->irq_request(s->pics_state-> - irq_request_opaque, 0); + pic_irq_request(s->pics_state->kvm, 0); s->init_state = 1; s->init4 = val & 1; if (val & 0x02) @@ -519,9 +519,8 @@ static int picdev_read(struct kvm_io_device *this, /* * callback when PIC0 irq status changed */ -static void pic_irq_request(void *opaque, int level) +static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); @@ -550,8 +549,6 @@ struct kvm_pic *kvm_create_pic(struct 
kvm *kvm) s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; - s->irq_request = pic_irq_request; - s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413..ffed06871c5 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -38,8 +38,6 @@ struct kvm; struct kvm_vcpu; -typedef void irq_request_func(void *opaque, int level); - struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ @@ -67,8 +65,6 @@ struct kvm_pic { unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); -- cgit v1.2.3 From 36633f32ba4c238403d19584754b30fe469d6dcb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:38:06 +0300 Subject: KVM: i8259: simplify pic_irq_request() calling sequence Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index caf6e1b3d95..bc10f0bd381 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -176,10 +176,7 @@ static void pic_update_irq(struct kvm_pic *s) pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) - pic_irq_request(s->kvm, 1); - else - pic_irq_request(s->kvm, 0); + pic_irq_request(s->kvm, irq >= 0); } void kvm_pic_update_irq(struct kvm_pic *s) -- cgit v1.2.3 From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:34:53 +0300 Subject: KVM: Add mini-API for vcpu->requests Makes it a little more readable and hackable. 
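Judging from the call sites in this diff, the generic half of the mini-API (which lives outside arch/x86 and is therefore not shown here) amounts to thin wrappers over the old open-coded bitops, presumably along these lines:

        static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
        {
                set_bit(req, &vcpu->requests);
        }

        static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
        {
                return test_and_clear_bit(req, &vcpu->requests);
        }

The win is readability at the call sites: the test-and-clear semantics are named once instead of being re-derived at every use.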
Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 27 +++++++++++++-------------- 6 files changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 49573c78c24..77d8c0f4817 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -534,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c5501bc1010..690a7fc58c1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_mmu_mark_parents_unsync(sp); } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); @@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f7a6fdcf8ef..587b99d37d4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1494,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 564548fbb3d..e16a0dbe74d 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -32,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); } if (waitqueue_active(q)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 345a3547051..661c6e199b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -899,7 +899,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) unsigned long sysenter_esp; kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9be6e4e5e8e..7ef44107a14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -296,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + 
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -948,7 +948,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); return 1; } @@ -2253,7 +2253,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -4617,7 +4617,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->run->request_interrupt_window; if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); r = kvm_mmu_reload(vcpu); @@ -4625,26 +4625,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) kvm_write_guest_time(vcpu); - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_x86_ops->tlb_flush(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } @@ -4773,7 +4772,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: @@ -5255,7 +5254,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); } -- cgit v1.2.3 From 7ac77099ce88a0c31b75acd0ec5ef3da4415a6d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 10:57:45 +0300 Subject: KVM: Prevent internal slots from being COWed If a process with a memory slot is COWed, the page will change its address (despite having an elevated reference count). This breaks internal memory slots which have their physical addresses loaded into vmcs registers (see the APIC access memory slot). 
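A small user-space demonstration of the underlying mmap semantics (it shows the sharing difference that matters here; in KVM the concern is the physical page moving under a COW write while the VMCS still caches the old address; error handling omitted for brevity):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                char *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                char *shar = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

                if (fork() == 0) {
                        /* Writing the private page triggers COW: the child
                         * silently switches to a new physical page. The
                         * shared page stays one physical page for both. */
                        strcpy(priv, "child-only");
                        strcpy(shar, "both-see-this");
                        _exit(0);
                }
                wait(NULL);
                printf("private: '%s'  shared: '%s'\n", priv, shar);
                return 0;
        }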
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ef44107a14..68be38e233f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5491,6 +5491,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5503,7 +5508,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(&current->mm->mmap_sem); -- cgit v1.2.3 From 6c3f6041172b78d5532c6bf3680d304e92ec2e66 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 22 Jun 2010 13:49:21 +0800 Subject: KVM: x86: Enable AVX for guest Enable Intel(R) Advanced Vector Extensions (AVX) for the guest. The detection of the AVX feature includes testing the OSXSAVE bit. When the OSXSAVE bit is not set, AVX instructions raise #UD even if AVX is supported, so we're safe to expose the AVX bits to the guest directly. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68be38e233f..d39d6b25d3e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1963,13 +1963,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | -- cgit v1.2.3 From 529df65e394e30a78f2633b575fd81fa5b973e30 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Mon, 21 Jun 2010 11:29:40 -0400 Subject: KVM: Search the LAPICs for one that will accept a PIC interrupt Older versions of 32-bit Linux have a "Checking 'hlt' instruction" test where they repeatedly call the 'hlt' instruction, and then expect a timer interrupt to kick the CPU out of halt. This happens before any LAPIC or IOAPIC setup, which means that all of the APICs are in virtual wire mode at this point. Unfortunately, the current implementation of virtual wire mode is hardcoded to only kick the BSP, so if a crash+kexec occurs on a different vcpu, it will never get kicked. This patch makes pic_unlock() do the equivalent of kvm_irq_delivery_to_apic() for the IOAPIC code. That is, it runs through all of the vcpus looking for one that is in virtual wire mode. In the normal case where LAPICs and IOAPICs are configured, this won't be used at all.
In the bootstrap phase of a modern OS, before the LAPICs and IOAPICs are configured, this will have exactly the same behavior as today; VCPU0 is always looked at first, so it will always get out of the loop after the first iteration. This will only go through the loop more than once during a kexec/kdump, in which case it will only do it a few times until the kexec'ed kernel programs the LAPIC and IOAPIC. Signed-off-by: Chris Lalancette Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bc10f0bd381..819b748a33f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -46,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *found = NULL; + int i; s->wakeup_needed = false; raw_spin_unlock(&s->lock); if (wakeup) { - vcpu = s->kvm->bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + kvm_vcpu_kick(found); } } -- cgit v1.2.3 From 3e0075094734de122e4cb09f930fa853a3c59f09 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jun 2010 14:26:18 +0300 Subject: KVM: Simplify vcpu_enter_guest() mmu reload logic slightly No need to reload the mmu in between two different vcpu->requests checks. kvm_mmu_reload() may trigger KVM_REQ_TRIPLE_FAULT, but that will be caught during atomic guest entry later. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6b25d3e..27322d34123 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4616,15 +4616,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) + if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) @@ -4649,6 +4643,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); -- cgit v1.2.3 From f5f48ee15c2ee3e44cf429e34b16c6fa9b900246 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 30 Jun 2010 12:25:15 +0800 Subject: KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Some guest device drivers may use "non-snooped" I/O and explicitly issue WBINVD or CLFLUSH on a RAM region. Since a migration may occur before the WBINVD or CLFLUSH, we need to maintain data consistency either by: 1: flushing the cache (wbinvd) when the guest is scheduled out, if there is no wbinvd exit, or 2: executing wbinvd on all dirty physical CPUs when the guest exits on wbinvd.
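A compilable model of the bookkeeping the patch introduces (a flat bitmask stands in for cpumask_var_t, and flush_cpu() for the wbinvd IPI):

        #include <stdio.h>

        #define NCPUS 64

        struct vcpu_model {
                unsigned long wbinvd_dirty_mask; /* one bit per physical cpu */
        };

        static void flush_cpu(int cpu)  /* stand-in for the wbinvd IPI */
        {
                printf("wbinvd on cpu %d\n", cpu);
        }

        /* On every load, remember which physical cpu may now hold dirty,
         * non-snooped data for this guest. */
        static void vcpu_load(struct vcpu_model *v, int cpu)
        {
                v->wbinvd_dirty_mask |= 1UL << cpu;
        }

        /* On an emulated WBINVD, flush exactly the cpus the guest has
         * dirtied, then start the mask over. */
        static void emulate_wbinvd(struct vcpu_model *v)
        {
                int cpu;

                for (cpu = 0; cpu < NCPUS; cpu++)
                        if (v->wbinvd_dirty_mask & (1UL << cpu))
                                flush_cpu(cpu); /* smp_call_function_many */
                v->wbinvd_dirty_mask = 0;
        }

        int main(void)
        {
                struct vcpu_model v = { 0 };

                vcpu_load(&v, 2);
                vcpu_load(&v, 5);
                emulate_wbinvd(&v); /* flushes cpus 2 and 5 only */
                return 0;
        }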
Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/emulate.c | 5 ++++- arch/x86/kvm/svm.c | 7 +++++++ arch/x86/kvm/vmx.c | 10 +++++++++- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a57cdeacc4d..2bda62485c4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,8 @@ struct kvm_vcpu_arch { /* fields used by HYPER-V emulation */ u64 hv_vapic; + + cpumask_var_t wbinvd_dirty_mask; }; struct kvm_arch { @@ -514,6 +517,8 @@ struct kvm_x86_ops { void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + bool (*has_wbinvd_exit)(void); + const struct trace_print_flags *exit_reasons_str; }; @@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abb8cec420a..e8bdddc4509 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3138,8 +3138,11 @@ twobyte_insn: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; break; - case 0x08: /* invd */ case 0x09: /* wbinvd */ + kvm_emulate_wbinvd(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ c->dst.type = OP_NONE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 587b99d37d4..56c9b6bd765 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_has_wbinvd_exit(void) +{ + return true; +} + static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .set_supported_cpuid = svm_set_supported_cpuid, + + .has_wbinvd_exit = svm_has_wbinvd_exit, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 661c6e199b4..4dfb1dc09c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -3397,7 +3403,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ + kvm_emulate_wbinvd(vcpu); return 1; } @@ -4347,6 +4353,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .rdtscp_supported = vmx_rdtscp_supported, .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
27322d34123..3d72fc06705 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1783,8 +1783,28 @@ out: return r; } +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { unsigned long khz = cpufreq_quick_get(cpu); @@ -3660,6 +3680,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } + wbinvd(); + return X86EMUL_CONTINUE; +} +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); @@ -5263,6 +5298,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5392,7 +5428,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: -- cgit v1.2.3 From 36a2e6774bfb5f32a0f23bb155f1f960321f291b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:02 +0800 Subject: KVM: MMU: fix writable sync sp mapping When we sync many unsync sps at one time (in mmu_sync_children()), we may map an spte writable. This is dangerous if one unsync sp's mapped gfn is another unsync sp's gfn. For example: SP1.pte[0] = P SP2.gfn's pfn = P [SP1.pte[0] = SP2.gfn's pfn] First, we write-protect SP1 and SP2, but both are still unsync sps. Then we sync SP1 first; it detects that SP1.pte[0].gfn has only one unsync sp, namely SP2, so it maps it writable. But we plan to sync SP2 soon, and at this point SP2->unsync is no longer reliable: when we later sync SP2, SP2->gfn is already writable. The final result: SP2 is a synced page, but SP2.gfn is writable. This bug corrupts the guest's page tables. Fix by making the mapping read-only if the mapped gfn has shadow pages.
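The shape of the fix can be seen in a small standalone model (simplified types and logic; a sketch of the reordered check, not the kernel's mmu_need_write_protect()):

    #include <stdbool.h>
    #include <stdio.h>

    struct sp { int level; bool unsync; };

    /* After the fix, !can_unsync forces read-only as soon as ANY shadow
     * page exists for the gfn, including one that is already unsync,
     * instead of only when deciding whether to unsync a synced sp. */
    static bool need_write_protect(struct sp *sps, int n, bool can_unsync)
    {
            for (int i = 0; i < n; i++) {
                    if (!can_unsync)
                            return true;   /* check moved to the top */
                    if (sps[i].level != 1) /* only last-level sps may unsync */
                            return true;
                    sps[i].unsync = true;
            }
            return false;
    }

    int main(void)
    {
            struct sp s = { 1, true };     /* SP2: already unsync */
            printf("%d\n", need_write_protect(&s, 1, false)); /* 1: keep RO */
            return 0;
    }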
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 690a7fc58c1..ca07ed083b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1810,11 +1810,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool need_unsync = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) + return 1; + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; if (!need_unsync && !s->unsync) { - if (!can_unsync || !oos_shadow) + if (!oos_shadow) return 1; need_unsync = true; } -- cgit v1.2.3 From 5fd5387c89ec99ff6cb82d2477ffeb7211b781c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:45 +0800 Subject: KVM: MMU: fix conflict access permissions in direct sp In non-direct mapping, we mark an sp as 'direct' when it maps the guest's large page, but its access is encoded from the upper page-structure entries only and does not include the last mapping, which can cause an access conflict. For example, take this mapping: [W] / PDE1 -> |---| P[W] | | LPA \ PDE2 -> |---| [R] P has two children, PDE1 and PDE2, and both PDE1 and PDE2 map the same large page (LPA). P's access is RW, PDE1's access is RW, and PDE2's access is RO (considering only read-write permissions here). When the guest accesses through PDE1, we create a direct sp for LPA; the sp's access comes from P, which is W, so we mark the ptes W in this sp. Then, when the guest accesses through PDE2, we find LPA's shadow page, which is the same as PDE1's, and mark the ptes RO. So if the guest accesses through PDE1 again, an incorrect #PF occurs. Fix by encoding the last mapping's access into the direct shadow page. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a21a86ef9e2..f4e4aaa65ff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,6 +339,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; + access &= gw->pte_access; + /* * It is a large guest pages backed by small host pages, * So we set @direct(@sp->role.direct)=1, and set -- cgit v1.2.3 From 9e7b0e7fba45ca3c6357aeb7091ebc281f1de365 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:03:28 +0800 Subject: KVM: MMU: fix direct sp's access corrupted If the mapping is writable but the dirty flag is not set, we will find the read-only direct sp and set up the mapping; then, if a write #PF occurs, we will mark this mapping writable in the read-only direct sp. Now any genuinely read-only mapping can happily write through it without a #PF, which may break the guest's COW. Fix by re-installing the mapping when the write #PF occurs.
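A tiny standalone demonstration of the idea (the ACC_* values here are illustrative, not the kernel's masks): folding the last level's permissions into the direct sp's access key makes PDE1 and PDE2 select different shadow pages:

    #include <stdio.h>

    #define ACC_WRITE 0x2
    #define ACC_USER  0x4

    /* Access key for a direct sp: the whole walk's permissions,
     * now including the last (large) mapping level. */
    static unsigned direct_sp_access(unsigned pt_access, unsigned pte_access,
                                     int dirty)
    {
            unsigned access = pt_access & pte_access;
            if (!dirty)
                    access &= ~ACC_WRITE;
            return access;
    }

    int main(void)
    {
            /* PDE1 is RW, PDE2 is RO: different keys, different direct sps. */
            printf("PDE1 key %x, PDE2 key %x\n",
                   direct_sp_access(ACC_WRITE | ACC_USER, ACC_WRITE | ACC_USER, 1),
                   direct_sp_access(ACC_WRITE | ACC_USER, ACC_USER, 1));
            return 0;
    }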
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f4e4aaa65ff..117d63f6304 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -325,8 +325,32 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + unsigned direct_access; + + if (level != gw->level) + continue; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + direct_access = gw->pt_access & gw->pte_access; + if (!is_dirty_gpte(gw->ptes[gw->level - 1])) + direct_access &= ~ACC_WRITE_MASK; + + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + continue; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); -- cgit v1.2.3 From 84754cd8fca66ed476585eabad68cacf42834199 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:05:00 +0800 Subject: KVM: MMU: cleanup FNAME(fetch)() functions Clean up this function, now that we already have the direct sp's access computed up front. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 117d63f6304..59e750c1a26 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -306,12 +306,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, gfn_t table_gfn; int r; int level; + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + unsigned direct_access; pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; + direct_access = gw->pt_access & gw->pte_access; + if (!dirty) + direct_access &= ~ACC_WRITE_MASK; + for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -319,15 +325,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - is_dirty_gpte(gw->ptes[gw->level-1]), - ptwrite, level, + dirty, ptwrite, level, gw->gfn, pfn, false, true); break; } if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; if (level != gw->level) continue; /* * For the direct sp, if the guest pte's dirty bit * changed form clean to dirty, it will corrupt the * sp's access: allow writable in the read-only sp, * so we should update the spte at this point to get * a new sp with the correct access.
*/ - direct_access = gw->pt_access & gw->pte_access; - if (!is_dirty_gpte(gw->ptes[gw->level - 1])) - direct_access &= ~ACC_WRITE_MASK; - child = page_header(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) continue; @@ -359,11 +359,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (level <= gw->level) { - int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_gpte(gw->ptes[level - delta])) - access &= ~ACC_WRITE_MASK; - access &= gw->pte_access; + access = direct_access; /* * It is a large guest pages backed by small host pages, -- cgit v1.2.3 From 828554136bbacae6e39fc31b9cd7e7c660ad7530 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Jul 2010 16:00:11 +0200 Subject: KVM: Remove unnecessary divide operations This patch converts unnecessary divide and modulo operations in the KVM large page related code into logical operations. This allows to convert gfn_t to u64 while not breaking 32 bit builds. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2bda62485c4..50c79b9f5c3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -44,7 +44,8 @@ /* KVM Hugepage definitions for x86 */ #define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ca07ed083b5..a20fd613acf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -423,8 +423,8 @@ static int *slot_largepage_idx(gfn_t gfn, { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].write_count; } @@ -528,8 +528,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].rmap_pde; } -- cgit v1.2.3 From c15a5958a0b6dbf06b3c05972694f04a0c50a4cf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:22 -0400 Subject: x86-64, asm: Directly access per-cpu IST Use a direct per-cpu reference for the IST instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ff..59af275b37a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit v1.2.3 From 72c511dd596cff88d6523f231a0fbb8f73006d51 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:23 -0400 Subject: x86-32, asm: Directly access per-cpu GDT Use a direct per-cpu reference for the GDT instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..233c5829e7a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -611,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -791,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS -- cgit v1.2.3 From cc05152ab72d7a65e6ea97d286af4f878c8f7371 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sat, 31 Jul 2010 22:51:01 +0200 Subject: x86,mmiotrace: Add support for tracing STOS instruction Add support for stos access tracing with mmiotrace. Signed-off-by: Marcin Slusarz Acked-by: Pekka Paalanen Cc: Nouveau Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Steven Rostedt LKML-Reference: <20100731205101.GA5860@joi.lan> Signed-off-by: Frederic Weisbecker --- arch/x86/mm/pf_in.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 308e32570d8..38e6d174c49 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; /* IA32 Manual 3, 3-432*/ -static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; static unsigned int rw32[] = { - 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; -static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; static unsigned int mw64[] = {}; #else /* not __i386__ */ static unsigned char prefix_codes[] = { @@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; static unsigned int rw32[] = { - 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; /* 8 bit only */ -static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; /* 16 bit only */ static unsigned int mw16[] = { 0xB70F, 0xBF0F }; /* 16 or 32 bit */ static unsigned int mw32[] = { 0xC7 }; /* 16, 32 or 64 bit */ -static unsigned int mw64[] = { 0x89, 0x8B }; +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; #endif /* not __i386__ */ struct prefix_bits { @@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs) unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) { unsigned int opcode; - unsigned char mod_rm; int reg; unsigned char *p; struct prefix_bits prf; @@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) goto err; do_work: - mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } switch (get_ins_reg_width(ins_addr)) { case 1: return *get_reg_w8(reg, prf.rex, regs); -- cgit v1.2.3 From dd180b3e90253cb4ca95d603a8c17413f8daec69 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 3 Jul 2010 16:02:42 +0800 Subject: KVM: VMX: fix tlb flush with invalid root Commit 341d9b535b6c simplify reload logic while entry guest mode, it can avoid unnecessary sync-root if KVM_REQ_MMU_RELOAD and KVM_REQ_MMU_SYNC both set. 
But it causes an issue: when we handle 'KVM_REQ_TLB_FLUSH', the root may be invalid. This was triggered during my test: Kernel BUG at ffffffffa00212b8 [verbose debug info unavailable] ...... Fix by returning directly if the root is not ready. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 2 -- arch/x86/kvm/vmx.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50c79b9f5c3..502e53f999c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,6 +40,8 @@ 0xFFFFFF0000000000ULL) #define INVALID_PAGE (~(hpa_t)0) +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a20fd613acf..70cdf6876b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,8 +92,6 @@ module_param(oos_shadow, bool, 0644); #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4dfb1dc09c8..2fdcc9819f3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1828,8 +1828,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_context(to_vmx(vcpu)); - if (enable_ept) + if (enable_ept) { + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From be38d276b0189fa86231fc311428622a1981ad62 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:31:27 +0300 Subject: KVM: MMU: Introduce drop_spte() When we call rmap_remove(), we (almost) always immediately follow it with an __set_spte() to a nonpresent pte. Since we need to perform the two operations atomically, to avoid losing the dirty and accessed bits, introduce a helper drop_spte() and convert all call sites. The operation is still nonatomic at this point.
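In a standalone model (stub functions stand in for the mmu helpers), the conversion looks like this; the value of the helper is that the unlink and the write now live in one place, which the following patches can then make atomic:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;
    static const u64 shadow_trap_nonpresent_pte = 0;

    static void rmap_remove(u64 *sptep) { printf("unlink %p from rmap\n", (void *)sptep); }
    static void __set_spte(u64 *sptep, u64 v) { *sptep = v; }

    /* One helper owns both steps of dropping an spte. */
    static void drop_spte(u64 *sptep, u64 new_spte)
    {
            rmap_remove(sptep);
            __set_spte(sptep, new_spte);
    }

    int main(void)
    {
            u64 spte = 0x1234;
            drop_spte(&spte, shadow_trap_nonpresent_pte);
            return (int)spte;
    }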
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 30 +++++++++++++++++------------- arch/x86/kvm/paging_tmpl.h | 13 ++++++------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70cdf6876b5..1ad39cf70e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + rmap_remove(kvm, sptep); + __set_spte(sptep, new_spte); +} + static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; @@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - rmap_remove(kvm, spte); + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - rmap_remove(kvm, &pt[i]); + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - rmap_remove(vcpu->kvm, sptep); - spte = shadow_trap_nonpresent_pte; - goto set_pte; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; } spte |= PT_WRITABLE_MASK; @@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: __set_spte(sptep, spte); +done: return ret; } @@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - rmap_remove(vcpu->kvm, spte); + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 59e750c1a26..796a325c7e5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ 
b/arch/x86/kvm/paging_tmpl.h @@ -353,8 +353,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (is_large_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -516,12 +515,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + drop_spte(vcpu->kvm, sptep, + shadow_trap_nonpresent_pte); need_flush = 1; - } - __set_spte(sptep, shadow_trap_nonpresent_pte); + } else + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -637,12 +637,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; - rmap_remove(vcpu->kvm, &sp->spt[i]); if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - __set_spte(&sp->spt[i], nonpresent); + drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); continue; } -- cgit v1.2.3 From ce061867aa2877605cda96fa8ec7dff15f70a983 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:38:12 +0300 Subject: KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Since we need to make the check atomic, move it to the place that will set the new spte. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ad39cf70e1..fbdca08b8d8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - pfn_t pfn; gfn_t gfn; unsigned long *rmapp; int i; - if (!is_rmap_spte(*spte)) - return; sp = page_header(__pa(spte)); - pfn = spte_to_pfn(*spte); - if (*spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(pfn); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { @@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { + pfn_t pfn; + + if (!is_rmap_spte(*sptep)) { + __set_spte(sptep, new_spte); + return; + } + pfn = spte_to_pfn(*sptep); + if (*sptep & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(*sptep)) + kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); __set_spte(sptep, new_spte); } -- cgit v1.2.3 From a9221dd5ec125fbec1702fae016c6d2ea1a9a3da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:48:06 +0300 Subject: KVM: MMU: Atomically check for accessed bit when dropping an spte Currently, in the window between the check for the accessed bit and actually dropping the spte, a vcpu can access the page through the spte and set the bit, which will be ignored by the mmu. Fix by using an exchange operation to atomically fetch the spte and drop it.
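The exchange can be modeled in userspace with a compiler builtin (a sketch assuming GCC/Clang __atomic builtins; bit positions are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define ACCESSED (1ull << 5)
    #define WRITABLE (1ull << 1)

    static void mark_accessed(void) { puts("pfn accessed"); }
    static void mark_dirty(void)    { puts("pfn dirty"); }

    /* Fetch the old spte and install the new one in a single atomic
     * step, so a concurrently-set accessed/dirty bit cannot be lost. */
    static void drop_spte(uint64_t *sptep, uint64_t new_spte)
    {
            uint64_t old = __atomic_exchange_n(sptep, new_spte,
                                               __ATOMIC_SEQ_CST);
            if (old & ACCESSED)
                    mark_accessed();
            if (old & WRITABLE)
                    mark_dirty();
    }

    int main(void)
    {
            uint64_t spte = ACCESSED | WRITABLE;
            drop_spte(&spte, 0);
            return 0;
    }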
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbdca08b8d8..ba2efcf2b86 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte) #endif } +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ +#ifdef CONFIG_X86_64 + return xchg(sptep, new_spte); +#else + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { pfn_t pfn; + u64 old_spte; - if (!is_rmap_spte(*sptep)) { - __set_spte(sptep, new_spte); + old_spte = __xchg_spte(sptep, new_spte); + if (!is_rmap_spte(old_spte)) return; - } - pfn = spte_to_pfn(*sptep); - if (*sptep & shadow_accessed_mask) + pfn = spte_to_pfn(old_spte); + if (old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*sptep)) + if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); - __set_spte(sptep, new_spte); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -- cgit v1.2.3 From b79b93f92cb3b66b89d75525fdfd2454b1e1f446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 15:46:44 +0300 Subject: KVM: MMU: Don't drop accessed bit while updating an spte __set_spte() will happily replace an spte with the accessed bit set with one that has the accessed bit clear. Add a helper update_spte() which checks for this condition and updates the page flag if needed. 
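The design point is to pay for the atomic exchange only when an accessed bit could actually be lost; roughly (a standalone sketch, not the kernel function):

    #include <stdint.h>

    static const uint64_t shadow_accessed_mask = 1ull << 5; /* illustrative */
    static int page_accessed;          /* stands in for struct page state */

    static void __set_spte(uint64_t *p, uint64_t v) { *p = v; }

    static void update_spte(uint64_t *sptep, uint64_t new_spte)
    {
            /* Fast path: nothing can be lost if the new spte keeps the
             * bit, or the hardware does not track one at all. */
            if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
                    __set_spte(sptep, new_spte);
                    return;
            }
            /* Slow path: clear atomically and propagate the old bit. */
            uint64_t old = __atomic_exchange_n(sptep, new_spte,
                                               __ATOMIC_SEQ_CST);
            if (old & shadow_accessed_mask)
                    page_accessed = 1;
    }

    int main(void)
    {
            uint64_t spte = shadow_accessed_mask;
            update_spte(&spte, 0);
            return !page_accessed;
    }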
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba2efcf2b86..d8d48329cb8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 old_spte; + + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + __set_spte(sptep, new_spte); + } else { + old_spte = __xchg_spte(sptep, new_spte); + if (old_spte & shadow_accessed_mask) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + } +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); + update_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte; + u64 *spte, new_spte, old_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_accessed_mask; if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); - __set_spte(spte, new_spte); + old_spte = __xchg_spte(spte, new_spte); + if (is_shadow_present_pte(old_spte) + && (old_spte & shadow_accessed_mask)) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } } @@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - __set_spte(sptep, spte); + update_spte(sptep, spte); done: return ret; } -- cgit v1.2.3 From a5046e6c7d97d6574ffe6367311ea0b0de56aa58 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:49:05 +0800 Subject: KVM: x86 emulator: fix 'mov sreg,rm16' instruction decoding Memory reads for 'mov sreg,rm16' should be 16 bits only. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8bdddc4509..d842a7d2bc6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -170,7 +170,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem | ModRM, Group | Group1A, + ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit v1.2.3 From ce7a0ad3bdcd86e6cf907eb5992fecb1503daa26 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:50:21 +0800 Subject: KVM: x86 emulator: fix the comment of out instruction Fix the comment of out instruction, using the same style as the other instructions. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d842a7d2bc6..ad8d7cdd1eb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2949,8 +2949,8 @@ special_insn: &c->dst.val)) goto done; /* IO is needed */ break; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ + case 0xee: /* out dx,al */ + case 0xef: /* out dx,(e/r)ax */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); -- cgit v1.2.3 From e97e883f8bfbe02cfc2bfff45e68921dfe590c7e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:51:09 +0800 Subject: KVM: x86 emulator: fix 'and AL,imm8' instruction decoding 'and AL,imm8' should be masked as ByteOp; otherwise the destination operand length will not be correct and we may fill the full EAX on writeback. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ad8d7cdd1eb..59568ad21ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -123,7 +123,7 @@ static u32 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From b16b2b7bb5a78afceb7fe22f2a04476cd70182b7 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:52:53 +0800 Subject: KVM: x86 emulator: fix 'mov rm,sreg' instruction decoding The source operand of 'mov rm,sreg' is a segment register, not a general-purpose register, so remove SrcReg from the decoding. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 59568ad21ab..8337567a0f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -169,7 +169,7 @@ static u32 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, -- cgit v1.2.3 From 07cbc6c185aee2c0479776845988242a040c7c93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:54:19 +0800 Subject: KVM: x86 emulator: fix cli/sti instruction emulation If the IOPL check fails, cli/sti emulate a GP, and then we should skip the writeback since the default write OP is OP_REG. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8337567a0f4..286572a5675 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2979,17 +2979,19 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback.
*/ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ -- cgit v1.2.3 From 5d55f299f97769130c6cc67896414c988db309ab Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 7 Jul 2010 17:43:35 +0800 Subject: KVM: x86 emulator: re-implementing 'mov AL,moffs' instruction decoding This patch change to use DstAcc for decoding 'mov AL, moffs' and introduced SrcAcc for decoding 'mov moffs, AL'. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 286572a5675..255473f974a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -70,6 +70,7 @@ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ +#define SrcAcc (0xd<<4) /* Source Accumulator */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -177,8 +178,8 @@ static u32 opcode_table[256] = { 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ @@ -1186,6 +1187,25 @@ done_prefixes: else c->src.val = insn_fetch(u8, 1, c->eip); break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; case SrcOne: c->src.bytes = 1; c->src.val = 1; @@ -2854,13 +2874,7 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; + case 0xa0 ... 0xa3: /* mov */ case 0xa4 ... 0xa5: /* movs */ goto mov; case 0xa6 ... 0xa7: /* cmps */ -- cgit v1.2.3 From b0eeec29fe7a5b114000f769bd68ffa02652bfb7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 15:40:18 +0300 Subject: KVM: MMU: Only indicate a fetch fault in page fault error code if nx is enabled Bit 4 of the page fault error code is set only if EFER.NX is set. 
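The error-code layout makes the rule easy to see; a small standalone demo (mask names are stand-ins for the kernel's PFERR_*_MASK constants, values follow the architectural bit positions):

    #include <stdio.h>

    #define PFERR_PRESENT (1u << 0)
    #define PFERR_WRITE   (1u << 1)
    #define PFERR_USER    (1u << 2)
    #define PFERR_RSVD    (1u << 3)
    #define PFERR_FETCH   (1u << 4)  /* only meaningful when EFER.NX=1 */

    static unsigned fault_error_code(int write, int user, int fetch, int nx)
    {
            unsigned ec = 0;
            if (write)      ec |= PFERR_WRITE;
            if (user)       ec |= PFERR_USER;
            if (fetch && nx)          /* the fix: gate bit 4 on EFER.NX */
                    ec |= PFERR_FETCH;
            return ec;
    }

    int main(void)
    {
            printf("nx=0: %#x  nx=1: %#x\n",
                   fault_error_code(0, 1, 1, 0), fault_error_code(0, 1, 1, 1));
            return 0;
    }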
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 796a325c7e5..3a3f6d784d7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -245,7 +245,7 @@ err: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) + if (fetch_fault && is_nx(vcpu)) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; -- cgit v1.2.3 From f59c1d2ded54e4bd7a9126f4a32c9eca8b336457 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 16:20:43 +0300 Subject: KVM: MMU: Keep going on permission error Real hardware disregards permission errors when computing page fault error code bit 0 (page present). Do the same. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 52 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3a3f6d784d7..1cea41cad06 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -119,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker, { pt_element_t pte; gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - int rsvd_fault = 0; + bool eperm, present, rsvd_fault; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: + present = true; + eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + goto error; + } --walker->level; } #endif @@ -151,31 +155,36 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) - goto not_present; + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + present = false; + break; + } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + break; + } - rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); - if (rsvd_fault) - goto access_error; + if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + rsvd_fault = true; + break; + } if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) - goto access_error; + eperm = true; if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; + eperm = true; #if PTTYPE == 64 if (fetch_fault && (pte & PT64_NX_MASK)) - goto access_error; + eperm = true; #endif - if (!(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, @@ -214,6 +223,9 @@ walk: --walker->level; } + if (!present || eperm || rsvd_fault) + goto error; + if (write_fault && !is_dirty_gpte(pte)) { bool ret; @@ -233,14 +245,10 @@ walk: __func__, (u64)pte, pte_access, pt_access); return 1; -not_present: +error: walker->error_code = 0; - goto err; - -access_error: - walker->error_code = 
PFERR_PRESENT_MASK; - -err: + if (present) walker->error_code |= PFERR_PRESENT_MASK; if (write_fault) walker->error_code |= PFERR_WRITE_MASK; if (user_fault) -- cgit v1.2.3 From 673813e81d8468e80b6dd0fa839923eb9748dc49 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Jul 2010 15:02:25 +0200 Subject: KVM: fix lock imbalance in kvm_create_pit() Stanse found that there is an omitted unlock in kvm_create_pit in one fail path. Add the proper unlock there. Signed-off-by: Jiri Slaby Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Gleb Natapov Cc: "Michael S. Tsirkin" Cc: Gregory Haskins Cc: kvm@vger.kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 467cc47fb73..70db4d43539 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -696,6 +696,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->wq = create_singlethread_workqueue("kvm-pit-wq"); if (!pit->wq) { + mutex_unlock(&pit->pit_state.lock); kfree(pit); return NULL; } -- cgit v1.2.3 From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 7 Jul 2010 20:16:45 +0300 Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area Currently, if the guest accesses an address that belongs to a memory slot but is not backed by a page, or the page is read-only, KVM treats it like an MMIO access. Remove that capability. It was never part of the interface and should not be relied upon. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d8d48329cb8..89d7a2cae53 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) if (is_hwpoison_pfn(pfn)) { kvm_send_hwpoison_signal(kvm, gfn); return 0; - } + } else if (is_fault_pfn(pfn)) + return -EFAULT; + return 1; } -- cgit v1.2.3 From a6f177efaa5856e22ed0d3c1e81e65b41654d083 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 8 Jul 2010 12:41:12 +0300 Subject: KVM: Reenter guest after emulation failure if due to access to non-mmio address When shadow pages are in use, KVM sometimes tries to emulate an instruction when it accesses a shadowed page. If emulation fails, KVM un-shadows the page and re-enters the guest to allow the vcpu to execute the instruction. If the page is not in the shadow page hash, KVM assumes this was an attempt to do MMIO and reports an emulation failure to userspace, since there is no way to fix the situation. This logic has a race, though. If two vcpus try to write to the same shadowed page simultaneously, both will enter the emulator, but only one of them will find the page in the shadow page hash, since the one that finds it also removes it from there; so the other cpu will report failure to userspace and abort the guest. Fix this by checking (in addition to the shadow page hash) that the page that caused the emulation belongs to a valid memory slot. If it does, re-enter the guest to allow the vcpu to re-execute the instruction.
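The decision tree can be sketched as follows (a standalone model; the stub predicates stand in for kvm_mmu_unprotect_page_virt(), the gva-to-gpa translation, and the memslot lookup named in the patch):

    #include <stdbool.h>

    static bool unprotected_shadow_page(void) { return false; } /* lost the race */
    static bool gva_translates(void)          { return true;  }
    static bool gpa_backed_by_memslot(void)   { return true;  }

    static bool should_reexecute(void)
    {
            if (unprotected_shadow_page())
                    return true;          /* classic un-shadow path */
            if (!gva_translates())
                    return true;          /* let the cpu raise the fault */
            /* The fix: a RAM-backed gpa means this was not MMIO, so
             * retry instead of reporting emulation failure. */
            return gpa_backed_by_memslot();
    }

    int main(void) { return should_reexecute() ? 0 : 1; }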
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d72fc06705..d51eed239b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3930,6 +3930,29 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return EMULATE_FAIL; } +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3998,7 +4021,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4019,12 +4042,7 @@ restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); if (r) { /* emulation failed */ - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); -- cgit v1.2.3 From aea924f606c309feead37ab5c43f410a08ff3826 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 10 Jul 2010 17:37:56 +0800 Subject: KVM: PIT: stop vpit before freeing irq_routing Fix: general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ...... Call Trace: [] ? kvm_set_irq+0xdd/0x24b [kvm] [] ? trace_hardirqs_off_caller+0x1f/0x10e [] ? sub_preempt_count+0xe/0xb6 [] ? put_lock_stats+0xe/0x27 ... 
RIP [] kvm_set_irq+0x17e/0x24b [kvm] This bug is triggered when the guest is shut down, because we freed irq_routing before the PIT thread stopped. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 3 +++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 70db4d43539..0fd6378981f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -752,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51eed239b4..d721e2d81a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5523,12 +5523,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); -- cgit v1.2.3 From 908e75f3e70ca580cc20442cf6780dcc2d0557b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Jul 2010 14:09:38 +0300 Subject: KVM: Expose MCE control MSRs to userspace Userspace needs to reset and save/restore these MSRs. The MCE banks are not exposed since their number varies from vcpu to vcpu. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d721e2d81a5..eb55ec55125 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -744,6 +744,8 @@ static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; static int set_efer(struct kvm_vcpu *vcpu, u64 efer) -- cgit v1.2.3 From 32ef26a3598636be520abed90ed0c2f439d36bbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:04 +0300 Subject: KVM: MMU: Add link_shadow_page() helper To simplify the process of fetching an spte, add a helper that links a shadow page to an spte.
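As an illustration, composing such an spte amounts to OR-ing the child table's physical address with the standard permission bits (standalone sketch; the PT_* values are stand-ins for the kernel's masks):

    #include <stdio.h>
    #include <stdint.h>

    #define PT_PRESENT  (1ull << 0)
    #define PT_WRITABLE (1ull << 1)
    #define PT_USER     (1ull << 2)
    #define PT_ACCESSED (1ull << 5)

    static void link_shadow_page(uint64_t *sptep, uint64_t child_table_pa)
    {
            /* Point the parent entry at the child shadow table with the
             * permissive bits set; real permissions live in the leaves. */
            *sptep = child_table_pa | PT_PRESENT | PT_ACCESSED
                                    | PT_WRITABLE | PT_USER;
    }

    int main(void)
    {
            uint64_t spte = 0;
            link_shadow_page(&spte, 0x1000);
            printf("spte = %#llx\n", (unsigned long long)spte);
            return 0;
    }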
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89d7a2cae53..df3a7a79cce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1482,6 +1482,16 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cea41cad06..36dc0749c87 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -309,7 +309,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp; - u64 spte, *sptep = NULL; + u64 *sptep = NULL; int direct; gfn_t table_gfn; int r; @@ -395,10 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } } - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + link_shadow_page(sptep, sp); } return sptep; -- cgit v1.2.3 From 121eee97a7802acda8b78436cc53196e9885549f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:05 +0300 Subject: KVM: MMU: Use __set_spte to link shadow pages To avoid split accesses to 64 bit sptes on i386, use __set_spte() to link shadow pages together. (not technically required since shadow pages are __GFP_KERNEL, so upper 32 bits are always clear) Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df3a7a79cce..5a6019a534a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1489,7 +1489,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + __set_spte(sptep, spte); } static void kvm_mmu_page_unlink_children(struct kvm *kvm, -- cgit v1.2.3 From a3aa51cfaafe9179add88db20506ccb07e030b47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:06 +0300 Subject: KVM: MMU: Add drop_large_spte() helper To clarify spte fetching code, move large spte handling into a helper. 
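A corresponding sketch of the new helper's contract (standalone model; the PS bit position and the stubs are illustrative):

    #include <stdint.h>

    typedef uint64_t u64;

    static int  flushed;
    static int  is_large_pte(u64 s)          { return s & (1ull << 7); }
    static void drop_spte(u64 *sptep, u64 v) { *sptep = v; }
    static void kvm_flush_remote_tlbs(void)  { flushed = 1; }

    /* Tear down a huge-page mapping so the fetch path can install a
     * next-level page table in its place. */
    static void drop_large_spte(u64 *sptep)
    {
            if (is_large_pte(*sptep)) {
                    drop_spte(sptep, 0);
                    kvm_flush_remote_tlbs();
            }
    }

    int main(void)
    {
            u64 spte = (1ull << 7) | 0x1000;
            drop_large_spte(&spte);
            return !(spte == 0 && flushed);
    }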
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ arch/x86/kvm/paging_tmpl.h | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a6019a534a..b75d6cb44ab 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1492,6 +1492,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) __set_spte(sptep, spte); } +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36dc0749c87..0fb7068d64c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -360,10 +360,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, kvm_flush_remote_tlbs(vcpu->kvm); } - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + drop_large_spte(vcpu, sptep); if (level <= gw->level) { direct = 1; -- cgit v1.2.3 From a357bd229cdaf37a41798d238ab50b34c71dd0d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:07 +0300 Subject: KVM: MMU: Add validate_direct_spte() helper Add a helper to verify that a direct shadow page is valid wrt the required access permissions; drop the page if it is not valid. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 +++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 27 ++++++--------------------- 2 files changed, 29 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b75d6cb44ab..36c62f33513 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1500,6 +1500,29 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0fb7068d64c..0c7461d3a5b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -338,30 +338,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - - if (level != gw->level) - continue; - - /* - * For the direct sp, if the guest pte's dirty bit - * changed form clean to dirty, it will corrupt the - * sp's access: allow writable in the read-only sp, - * so we should update the spte at this point to get - * a new sp with the correct access. 
- */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); - if (child->role.access == direct_access) - continue; - - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) + && level == gw->level) + validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); + if (is_shadow_present_pte(*sptep)) + continue; + if (level <= gw->level) { direct = 1; access = direct_access; -- cgit v1.2.3 From 39c8c672a18c52048343d7531dfb2dcf3431ee74 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:08 +0300 Subject: KVM: MMU: Add gpte_valid() helper Move the code to check whether a gpte has changed since we fetched it into a helper. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0c7461d3a5b..e1c1f9eb1cc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -299,6 +299,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gpte_to_gfn(gpte), pfn, true, true); } +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, + struct guest_walker *gw, int level) +{ + int r; + pt_element_t curr_pte; + + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -312,11 +323,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, u64 *sptep = NULL; int direct; gfn_t table_gfn; - int r; int level; bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; - pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) @@ -365,17 +374,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); - if (!direct) { - r = kvm_read_guest_atomic(vcpu->kvm, - gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { + if (!direct) + /* + * Verify that the gpte in the page we've just write + * protected is still there. + */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } - } link_shadow_page(sptep, sp); } -- cgit v1.2.3 From 0b3c933302262d83018dd5f69656bca9f28a0cd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:09 +0300 Subject: KVM: MMU: Simplify spte fetch() function Partition the function into three sections: - fetching indirect shadow pages (host_level > guest_level) - fetching direct shadow pages (page_level < host_level <= guest_level) - the final spte (page_level == host_level) Instead of the current spaghetti. A slight change from the original code is that we call validate_direct_spte() more often: previously we called it only for gw->level, now we also call it for lower levels. The change should have no effect. 
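(A rough skeleton of the partitioned fetch(), condensed from the diff below; comments stand in for the loop bodies and the argument lists are abbreviated.)

	/* 1: indirect levels, walking down to gw->level */
	for (shadow_walk_init(&iterator, vcpu, addr);
	     shadow_walk_okay(&iterator) && iterator.level > gw->level;
	     shadow_walk_next(&iterator))
		/* get indirect sp, re-check the gpte, link_shadow_page() */;

	/* 2: direct levels, walking down to hlevel */
	for (; shadow_walk_okay(&iterator) && iterator.level > hlevel;
	     shadow_walk_next(&iterator))
		/* validate_direct_spte(), get direct sp, link_shadow_page() */;

	/* 3: the final spte at hlevel */
	mmu_set_spte(vcpu, iterator.sptep, /* ... */);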
[xiao: fix regression caused by validate_direct_spte() called too late] Signed-off-by: Avi Kivity Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 93 ++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e1c1f9eb1cc..368e4cb6233 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -321,9 +321,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned access = gw->pt_access; struct kvm_mmu_page *sp; u64 *sptep = NULL; - int direct; - gfn_t table_gfn; - int level; + int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -335,61 +333,68 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; - for_each_shadow_entry(vcpu, addr, iterator) { + for (shadow_walk_init(&iterator, vcpu, addr); + shadow_walk_okay(&iterator) && iterator.level > gw->level; + shadow_walk_next(&iterator)) { + gfn_t table_gfn; + level = iterator.level; sptep = iterator.sptep; - if (iterator.level == hlevel) { - mmu_set_spte(vcpu, sptep, access, - gw->pte_access & access, - user_fault, write_fault, - dirty, ptwrite, level, - gw->gfn, pfn, false, true); - break; - } - - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) - && level == gw->level) - validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); if (is_shadow_present_pte(*sptep)) continue; - if (level <= gw->level) { - direct = 1; - access = direct_access; - - /* - * It is a large guest pages backed by small host pages, - * So we set @direct(@sp->role.direct)=1, and set - * @table_gfn(@sp->gfn)=the base page frame for linear - * translations. - */ - table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - access &= gw->pte_access; - } else { - direct = 0; - table_gfn = gw->table_gfn[level - 2]; - } + table_gfn = gw->table_gfn[level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - direct, access, sptep); - if (!direct) - /* - * Verify that the gpte in the page we've just write - * protected is still there. - */ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { - kvm_mmu_put_page(sp, sptep); - kvm_release_pfn_clean(pfn); - sptep = NULL; - break; - } + false, access, sptep); + + /* + * Verify that the gpte in the page we've just write + * protected is still there. 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + goto out_gpte_changed; + + link_shadow_page(sptep, sp); + } + + for (; + shadow_walk_okay(&iterator) && iterator.level > hlevel; + shadow_walk_next(&iterator)) { + gfn_t direct_gfn; + level = iterator.level; + sptep = iterator.sptep; + + validate_direct_spte(vcpu, sptep, direct_access); + + drop_large_spte(vcpu, sptep); + + if (is_shadow_present_pte(*sptep)) + continue; + + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, + true, direct_access, sptep); link_shadow_page(sptep, sp); } + sptep = iterator.sptep; + level = iterator.level; + + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, level, + gw->gfn, pfn, false, true); + return sptep; + +out_gpte_changed: + kvm_mmu_put_page(sp, sptep); + kvm_release_pfn_clean(pfn); + return NULL; } /* -- cgit v1.2.3 From 5991b33237b7fc7dd9f62ae04998c42217d444a7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:10 +0300 Subject: KVM: MMU: Validate all gptes during fetch, not just those used for new pages Currently, when we fetch an spte, we only verify that gptes match those that the walker saw if we build new shadow pages for them. However, this misses the following race: vcpu1 vcpu2 walk change gpte walk instantiate sp fetch existing sp Fix by validating every gpte, regardless of whether it is used for building a new sp or not. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 368e4cb6233..8cb85f9c8ad 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -319,10 +319,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp = NULL; u64 *sptep = NULL; int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + int top_level; unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -333,6 +334,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; + top_level = vcpu->arch.mmu.root_level; + if (top_level == PT32E_ROOT_LEVEL) + top_level = PT32_ROOT_LEVEL; + /* + * Verify that the top-level gpte is still there. Since the page + * is a root page, it is either write protected (and cannot be + * changed from now on) or it is invalid (in which case, we don't + * really care if it changes underneath us after this point). 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, top_level)) + goto out_gpte_changed; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator) && iterator.level > gw->level; shadow_walk_next(&iterator)) { @@ -343,12 +356,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, drop_large_spte(vcpu, sptep); - if (is_shadow_present_pte(*sptep)) - continue; - - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + sp = NULL; + if (!is_shadow_present_pte(*sptep)) { + table_gfn = gw->table_gfn[level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + false, access, sptep); + } /* * Verify that the gpte in the page we've just write @@ -357,7 +370,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, level - 1)) goto out_gpte_changed; - link_shadow_page(sptep, sp); + if (sp) + link_shadow_page(sptep, sp); } for (; @@ -392,7 +406,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return sptep; out_gpte_changed: - kvm_mmu_put_page(sp, sptep); + if (sp) + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit v1.2.3 From 24157aaf833261e68e5a398fa54bd15e4fa1d0b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:11 +0300 Subject: KVM: MMU: Eliminate redundant temporaries in FNAME(fetch) 'level' and 'sptep' are aliases for 'interator.level' and 'iterator.sptep', no need for them. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 59 +++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8cb85f9c8ad..d9a2742014e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -320,12 +320,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - u64 *sptep = NULL; - int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); int top_level; unsigned direct_access; - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; @@ -346,68 +344,59 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - for (shadow_walk_init(&iterator, vcpu, addr); - shadow_walk_okay(&iterator) && iterator.level > gw->level; - shadow_walk_next(&iterator)) { + for (shadow_walk_init(&it, vcpu, addr); + shadow_walk_okay(&it) && it.level > gw->level; + shadow_walk_next(&it)) { gfn_t table_gfn; - level = iterator.level; - sptep = iterator.sptep; - - drop_large_spte(vcpu, sptep); + drop_large_spte(vcpu, it.sptep); sp = NULL; - if (!is_shadow_present_pte(*sptep)) { - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access, it.sptep); } /* * Verify that the gpte in the page we've just write * protected is still there. 
*/ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; if (sp) - link_shadow_page(sptep, sp); + link_shadow_page(it.sptep, sp); } for (; - shadow_walk_okay(&iterator) && iterator.level > hlevel; - shadow_walk_next(&iterator)) { + shadow_walk_okay(&it) && it.level > hlevel; + shadow_walk_next(&it)) { gfn_t direct_gfn; - level = iterator.level; - sptep = iterator.sptep; + validate_direct_spte(vcpu, it.sptep, direct_access); - validate_direct_spte(vcpu, sptep, direct_access); + drop_large_spte(vcpu, it.sptep); - drop_large_spte(vcpu, sptep); - - if (is_shadow_present_pte(*sptep)) + if (is_shadow_present_pte(*it.sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, - true, direct_access, sptep); - link_shadow_page(sptep, sp); + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, + true, direct_access, it.sptep); + link_shadow_page(it.sptep, sp); } - sptep = iterator.sptep; - level = iterator.level; - - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - user_fault, write_fault, dirty, ptwrite, level, + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); - return sptep; + return it.sptep; out_gpte_changed: if (sp) - kvm_mmu_put_page(sp, sptep); + kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit v1.2.3 From c0e0608cb902af1a1fd8d413ec0a07ee1e62c652 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 13 Jul 2010 16:40:23 +0300 Subject: KVM: x86: emulator: inc/dec can have lock prefix Mark inc (0xfe/0 0xff/0) and dec (0xfe/1 0xff/1) as lock prefix capable. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 255473f974a..b38bd8b92aa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -345,10 +345,10 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, -- cgit v1.2.3 From 68be0803456b3eed33038be5566710ad7648c854 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 14 Jul 2010 19:05:45 +0300 Subject: KVM: x86: never re-execute instruction with enabled tdp With tdp enabled we should get into emulator only when emulating io, so reexecution will always bring us back into emulator. 
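(A sketch of the resulting guard with the reasoning spelled out in a comment; the actual one-hunk change follows, and the rest of the function body is elided here.)

	static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
	{
		/*
		 * With two-dimensional paging there are no shadowed guest
		 * page tables to unshadow, so re-entering the guest cannot
		 * cure a failed emulation; it would only fault straight
		 * back into the emulator.
		 */
		if (tdp_enabled)
			return false;
		/* ... shadow-paging unshadow path ... */
	}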
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb55ec55125..689c2c3182a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3936,6 +3936,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; + if (tdp_enabled) + return false; + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-entetr the -- cgit v1.2.3 From 9195c4da26bbf8860e2e7b648dbf4ab465c7933a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 15 Jul 2010 12:24:37 +0300 Subject: KVM: x86: Call mask notifiers from pic If the pit delivers an interrupt while the pic is masking it, the OS will never do EOI and the ack notifier will not be called, so once the pit is unmasked no pit interrupts will be delivered any more. Calling mask notifiers solves this issue. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 819b748a33f..8d10c063d7f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -363,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } } else switch (s->init_state) { - case 0: /* normal mode */ + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; + } case 1: s->irq_base = val & 0xf8; s->init_state = 2; -- cgit v1.2.3 From c19b8bd60e19308d5583ef200ddcc782d85d9543 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 15 Jul 2010 08:51:58 +0800 Subject: KVM: x86 emulator: fix xchg instruction emulation If the destination is a memory operand and the memory cannot map to a valid page, the xchg instruction emulation and locked instructions will not work on io regions and get stuck in an endless loop. We should emulate exchange as a write to fix it. Signed-off-by: Wei Yongjun Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 689c2c3182a..97aab036dab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } kaddr = kmap_atomic(page, KM_USER0); kaddr += offset_in_page(gpa); -- cgit v1.2.3 From 6e3e243c3b6e0bbd18c6ce0fbc12bc3fe2d77b34 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 16 Jul 2010 11:52:55 +0200 Subject: KVM: MMU: fix mmu notifier invalidate handler for huge spte The index wasn't calculated correctly (off by one) for huge sptes, so KVM guests were unstable with transparent hugepages.
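(A worked example with made-up numbers may help; assume 512 small pages per huge page, i.e. a gfn shift of 9.)

	/*
	 * memslot->base_gfn = 384 (not huge-page aligned), gfn_offset = 200:
	 *
	 *   old: idx = 200 / 512                        = 0
	 *   new: idx = ((384 + 200) >> 9) - (384 >> 9)  = 1 - 0 = 1
	 *
	 * The old formula ignored the slot's unaligned base_gfn, so the
	 * handler ran on the rmap of the wrong (neighbouring) huge page.
	 */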
Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36c62f33513..812770cddc8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -850,8 +850,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int sh; + + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); + idx = ((memslot->base_gfn+gfn_offset) >> sh) - + (memslot->base_gfn >> sh); ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); -- cgit v1.2.3 From fa1de2bfc0feb7245328ad25fb3e6d5cd2c903b4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:19:51 +0800 Subject: KVM: MMU: add missing reserved bits check in speculative path In the speculative path, we should check the guest pte's reserved bits just as the real processor does. Reported-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++++++- arch/x86/kvm/paging_tmpl.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 812770cddc8..d2ea9cabc06 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2697,6 +2697,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + return; + ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -2775,6 +2778,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); @@ -2849,6 +2853,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ?
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -2896,7 +2901,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry) + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d9a2742014e..51ef9097960 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -638,8 +638,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (gfn != sp->gfns[i] || - !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + || gfn != sp->gfns[i] || !is_present_gpte(gpte) + || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; if (is_present_gpte(gpte) || !clear_unsync) -- cgit v1.2.3 From daa3db693ce925a14b7e17ab6f306dc0e6a5342c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:23:04 +0800 Subject: KVM: MMU: fix broken page accessed tracking with ept enabled In the current code, if ept is enabled (shadow_accessed_mask = 0), page accessed tracking is lost. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2ea9cabc06..9b3b916ebea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -687,7 +687,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); - if (old_spte & shadow_accessed_mask) + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); @@ -815,7 +815,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, kvm_set_pfn_dirty(spte_to_pfn(*spte)); old_spte = __xchg_spte(spte, new_spte); if (is_shadow_present_pte(old_spte) - && (old_spte & shadow_accessed_mask)) + && (!shadow_accessed_mask || + old_spte & shadow_accessed_mask)) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } -- cgit v1.2.3 From 9ed5520dd3c9cb79c25f95fce9c57b87637d0fb7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:25:17 +0800 Subject: KVM: MMU: fix page dirty tracking lost while sync page In the sync-page path, if spte.writable changes, page dirty tracking can be lost. For example: assume spte.writable = 0 in an unsync page; when it is synced, the spte is mapped writable (spte.writable = 1); later the guest writes spte.gfn, so spte.gfn is dirty; the guest then changes the mapping to read-only, and after the next sync spte.writable = 0. So when the host releases the spte, it sees spte.writable = 0 and does not mark the page dirty. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b3b916ebea..a04756a26fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1985,6 +1985,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) + kvm_set_pfn_dirty(pfn);
update_spte(sptep, spte); done: return ret; @@ -1998,7 +2000,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -2048,15 +2049,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); - } else { - if (was_writable) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); } + kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; -- cgit v1.2.3 From be233d49ea8c1fde9f4afec378dc2c2f16ab0263 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:27:10 +0800 Subject: KVM: MMU: don't atomicly set spte if it's not present If the old mapping is not present, the spte.a is not lost, so no need atomic operation to set it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a04756a26fe..9c7fae08291 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,9 +307,10 @@ static void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || + !is_rmap_spte(*sptep)) __set_spte(sptep, new_spte); - } else { + else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); -- cgit v1.2.3 From e4b502ead259fcf70839414abb7c8cdc3b523f01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:28:09 +0800 Subject: KVM: MMU: cleanup spte set and accssed/dirty tracking Introduce set_spte_track_bits() to cleanup current code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c7fae08291..e4b862eb888 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -679,7 +679,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte; @@ -692,6 +692,11 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + set_spte_track_bits(sptep, new_spte); rmap_remove(kvm, sptep); } @@ -791,7 +796,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte, old_spte; + u64 *spte, new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -812,13 +817,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(spte_to_pfn(*spte)); - old_spte = __xchg_spte(spte, new_spte); - if (is_shadow_present_pte(old_spte) - && (!shadow_accessed_mask 
|| - old_spte & shadow_accessed_mask)) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + set_spte_track_bits(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } -- cgit v1.2.3 From 9a3aad70572c3f4d55e7f09ac4eb313d41d0a484 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:30:18 +0800 Subject: KVM: MMU: using __xchg_spte more smarter Sometimes, atomically set spte is not needed, this patch call __xchg_spte() more smartly Note: if the old mapping's access bit is already set, we no need atomic operation since the access bit is not lost Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4b862eb888..0dcc95e0987 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -682,9 +682,14 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; - u64 old_spte; + u64 old_spte = *sptep; + + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || + old_spte & shadow_accessed_mask) { + __set_spte(sptep, new_spte); + } else + old_spte = __xchg_spte(sptep, new_spte); - old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); -- cgit v1.2.3 From 3444d7da1839b851eefedd372978d8a982316c36 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 18:32:38 +0300 Subject: KVM: VMX: Fix host GDT.LIMIT corruption vmx does not restore GDT.LIMIT to the host value, instead it sets it to 64KB. This means host userspace can learn a few bits of host memory. Fix by reloading GDTR when we load other host state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2fdcc9819f3..27a0222c294 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -185,6 +185,7 @@ static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -871,6 +872,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (current_thread_info()->status & TS_USEDFPU) clts(); + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1379,6 +1381,8 @@ static int hardware_enable(void *garbage) ept_sync_global(); } + store_gdt(&__get_cpu_var(host_gdt)); + return 0; } -- cgit v1.2.3 From e8c534ec068af1a0845aceda373a9bfd2de62030 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Tue, 27 Jul 2010 18:53:35 +0200 Subject: x86: Fix keeping track of AMD C1E Accomodate the original C1E-aware idle routine to the different times during boot when the BIOS enables C1E. While at it, remove the synthetic CPUID flag in favor of a single global setting which denotes C1E status on the system. [ hpa: changed c1e_enabled to be a bool; clarified cpu bit 3:21 comment ] Signed-off-by: Michal Schmidt LKML-Reference: <20100727165335.GA11630@aftab> Signed-off-by: Borislav Petkov Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/process.c | 8 +++++--- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aa2c39d968f..92091de1111 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + else if (c1e_detected) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 817aa316b18..0b205b8a430 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -89,7 +89,7 @@ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ + /* 21 available, was AMD_C1E */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d85637bb950..325b7bdbeba 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -762,6 +762,7 @@ extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; extern unsigned long idle_nomwait; +extern bool c1e_detected; /* * on systems with caches, caches must be flashed as the absolute diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 553b02f1309..b944f89c4e6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,8 +525,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); + static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -548,12 +550,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit v1.2.3 From fe96eb404e33b59bb39f7050205f7c56c1c7d686 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 18 Mar 2010 13:53:24 -0400 Subject: x86: Detect whether we should use Xen SWIOTLB. It is paramount that we call pci_xen_swiotlb_detect before pci_swiotlb_detect as both implementations use the 'swiotlb' and 'swiotlb_force' flags. The pci-xen_swiotlb_detect inhibits the swiotlb_force and swiotlb flag so that the native SWIOTLB implementation is not enabled when running under Xen. [since v1 changed two Cc's to Acked-by] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Acked-by: FUJITA Tomonori [http://lkml.org/lkml/2010/7/27/374] Cc: Albert Herranz Cc: Ian Campbell Cc: Thomas Gleixner Acked-by: "H. 
Peter Anvin" [conditional http://lkml.org/lkml/2010/8/2/324] Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01d..9f07cfcbd3a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -13,6 +13,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; @@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) goto out; gart_iommu_hole_init(); @@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); out: + pci_xen_swiotlb_init(); + pci_swiotlb_init(); } @@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb) { + if (swiotlb || xen_swiotlb) { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_print_info(); -- cgit v1.2.3 From be783a47214afc5a0aea9dafcbd9f1535ba05e94 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 2 Aug 2010 08:49:34 +0800 Subject: x86, vdso: Unmap vdso pages We mapped vdso pages but never unmapped them and the virtual address is lost after exiting from the function, so unmap vdso pages here. Signed-off-by: Shaohua Li LKML-Reference: <20100802004934.GA2505@sli10-desk.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b814..80f23ed483e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -67,6 +67,7 @@ static int __init init_vdso_vars(void) *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" #undef VEXTERN + vunmap(vbase); return 0; oom: -- cgit v1.2.3 From 22a57f5896df218356bae6203dfaf04bcfd6c88c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 15:34:44 -0700 Subject: x86, setup: Allow global variables and functions in the decompressor In order for global variables and functions to work in the decompressor, we need to fix up the GOT in assembly code. Signed-off-by: H. Peter Anvin LKML-Reference: <4C57382E.8050501@zytor.com> --- arch/x86/boot/compressed/head_32.S | 13 +++++++++++++ arch/x86/boot/compressed/head_64.S | 13 +++++++++++++ arch/x86/boot/compressed/vmlinux.lds.S | 6 ++++++ 3 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index f543b70ffae..67a655a39ce 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -123,6 +123,19 @@ relocated: shrl $2, %ecx rep stosl +/* + * Adjust our own GOT + */ + leal _got(%ebx), %edx + leal _egot(%ebx), %ecx +1: + cmpl %ecx, %edx + jae 2f + addl %ebx, (%edx) + addl $4, %edx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. 
*/ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index faff0dc9c06..52f85a196fa 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -279,6 +279,19 @@ relocated: shrq $3, %rcx rep stosq +/* + * Adjust our own GOT + */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + addq %rbx, (%rdx) + addq $8, %rdx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. */ diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 5ddabceee12..34d047c9828 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -41,6 +41,12 @@ SECTIONS *(.rodata.*) _erodata = . ; } + .got : { + _got = .; + KEEP(*(.got.plt)) + KEEP(*(.got)) + _egot = .; + } .data : { _data = . ; *(.data) -- cgit v1.2.3 From f4ed2877b16e8146427306aea8819adac5c88374 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 02:17:31 -0700 Subject: x86, setup: reorganize the early console setup Separate early_serial_console from tty.c This allows for reuse of early_serial_console.c/string.c/printf.c/cmdline.c in boot/compressed/. -v2: according to hpa, don't include string.c etc -v3: compressed/misc.c must have early_serial_base as static, so move it back to tty.c for setup code Signed-off-by: Yinghai Lu LKML-Reference: <4C568D2B.205@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 8 +- arch/x86/boot/boot.h | 35 +++++---- arch/x86/boot/cmdline.c | 6 +- arch/x86/boot/early_serial_console.c | 139 +++++++++++++++++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 ++++++ arch/x86/boot/printf.c | 4 +- arch/x86/boot/tty.c | 136 +--------------------------------- 7 files changed, 186 insertions(+), 163 deletions(-) create mode 100644 arch/x86/boot/early_serial_console.c create mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ec749c2bfdd..f7cb086b4ad 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o -setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o regs.o string.o tty.o video.o video-mode.o -setup-y += version.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o +setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o +setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o +setup-y += video-mode.o version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 46c4c5c71af..00cf51cfc2e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -200,21 +200,7 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} +#include "isdigit.h" /* Heap -- available for dynamic lists. 
*/ extern char _end[]; @@ -300,8 +286,18 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int cmdline_find_option(const char *option, char *buffer, int bufsize); -int cmdline_find_option_bool(const char *option); +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); +} + +static inline int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); +} + /* cpu.c, cpucheck.c */ struct cpu_features { @@ -313,6 +309,10 @@ extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + /* edd.c */ void query_edd(void); @@ -348,7 +348,6 @@ unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ -void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index a1d35634bce..6b3b6f708c0 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,9 +27,8 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int cmdline_find_option(const char *option, char *buffer, int bufsize) +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int len = -1; @@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int cmdline_find_option_bool(const char *option) +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int pos = 0, wstart = 0; diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c new file mode 100644 index 00000000000..030f4b93e25 --- /dev/null +++ b/arch/x86/boot/early_serial_console.c @@ -0,0 +1,139 @@ +#include "boot.h" + +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + +static void early_serial_init(int port, int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = port; +} 
+ +static void parse_earlyprintk(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + int port = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + port = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int idx = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + idx = 1; + + port = bases[idx]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (port) + early_serial_init(port, baud); +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static void parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + int port = 0; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + port = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + port = simple_strtoull(options + 8, &options, 0); + else + return; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(port); + + if (port) + early_serial_init(port, baud); +} + +void console_init(void) +{ + parse_earlyprintk(); + + if (!early_serial_base) + parse_console_uart8250(); +} diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h new file mode 100644 index 00000000000..25e13403193 --- /dev/null +++ b/arch/x86/boot/isdigit.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 50e47cdbddd..cdac91ca55d 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -34,7 +34,7 @@ static int skip_atoi(const char **s) #define SMALL 32 /* Must be 32 == 0x20 */ #define SPECIAL 64 /* 0x */ -#define do_div(n,base) ({ \ +#define __do_div(n, base) ({ \ int __res; \ __res = ((unsigned long) n) % (unsigned) base; \ n = ((unsigned long) n) / (unsigned) base; \ @@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = (digits[do_div(num, base)] | locase); + tmp[i++] = (digits[__do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index ff4b27a0fc5..def2451f46a 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -15,27 +15,12 @@ #include "boot.h" -#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ - -static int early_serial_base; +int early_serial_base; #define XMTRDY 0x20 -#define DLAB 0x80 - #define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 
/* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ #define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -#define DEFAULT_BAUD 9600 /* * These functions are in .inittext so they can be used to signal @@ -152,122 +137,3 @@ int getchar_timeout(void) return 0; /* Timeout! */ } -static void early_serial_init(int port, int baud) -{ - unsigned char c; - unsigned divisor; - - outb(0x3, port + LCR); /* 8n1 */ - outb(0, port + IER); /* no interrupt */ - outb(0, port + FCR); /* no fifo */ - outb(0x3, port + MCR); /* DTR + RTS */ - - divisor = 115200 / baud; - c = inb(port + LCR); - outb(c | DLAB, port + LCR); - outb(divisor & 0xff, port + DLL); - outb((divisor >> 8) & 0xff, port + DLH); - outb(c & ~DLAB, port + LCR); - - early_serial_base = port; - - printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); -} - -static void parse_earlyprintk(void) -{ - int baud = DEFAULT_BAUD; - char arg[32]; - int pos = 0; - int port = 0; - - if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { - char *e; - - if (!strncmp(arg, "serial", 6)) { - port = DEFAULT_SERIAL_PORT; - pos += 6; - } - - if (arg[pos] == ',') - pos++; - - if (!strncmp(arg, "ttyS", 4)) { - static const int bases[] = { 0x3f8, 0x2f8 }; - int idx = 0; - - if (!strncmp(arg + pos, "ttyS", 4)) - pos += 4; - - if (arg[pos++] == '1') - idx = 1; - - port = bases[idx]; - } - - if (arg[pos] == ',') - pos++; - - baud = simple_strtoull(arg + pos, &e, 0); - if (baud == 0 || arg + pos == e) - baud = DEFAULT_BAUD; - } - - if (port) - early_serial_init(port, baud); -} - -#define BASE_BAUD (1843200/16) -static unsigned int probe_baud(int port) -{ - unsigned char lcr, dll, dlh; - unsigned int quot; - - lcr = inb(port + LCR); - outb(lcr | DLAB, port + LCR); - dll = inb(port + DLL); - dlh = inb(port + DLH); - outb(lcr, port + LCR); - quot = (dlh << 8) | dll; - - return BASE_BAUD / quot; -} - -static void parse_console_uart8250(void) -{ - char optstr[64], *options; - int baud = DEFAULT_BAUD; - int port = 0; - - /* - * console=uart8250,io,0x3f8,115200n8 - * need to make sure it is last one console ! - */ - if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return; - - options = optstr; - - if (!strncmp(options, "uart8250,io,", 12)) - port = simple_strtoull(options + 12, &options, 0); - else if (!strncmp(options, "uart,io,", 8)) - port = simple_strtoull(options + 8, &options, 0); - else - return; - - if (options && (options[0] == ',')) - baud = simple_strtoull(options + 1, &options, 0); - else - baud = probe_baud(port); - - if (port) - early_serial_init(port, baud); -} - -void console_init(void) -{ - parse_earlyprintk(); - - if (!early_serial_base) - parse_console_uart8250(); -} -- cgit v1.2.3 From 9f242dc10e0c3c1eb32d8c83c18650a35fd7f80d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 2 Aug 2010 16:10:37 -0700 Subject: x86, vmware: Preset lpj values when on VMware. When running on VMware's platform, we have seen situations where the AP's try to calibrate the lpj values and fail to get good calibration runs becasue of timing issues. As a result delays don't work correctly on all cpus. The solutions is to set preset_lpj value based on the current tsc frequency value. This is similar to what KVM does as well. Signed-off-by: Alok N Kataria LKML-Reference: <1280790637.14933.29.camel@ank32.eng.vmware.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff58844..227b0448960 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -- cgit v1.2.3 From 8fee13a48e4879fba57725f6d9513df4bfa8e9f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 16:21:22 -0700 Subject: x86, setup: enable early console output from the decompressor This enables the decompressor output to be seen on the serial console. Most of the code is shared with the regular boot code. We could add printf to the decompressor if needed, but currently there is no sufficiently compelling user. -v2: define BOOT_BOOT_H to avoid include boot.h -v3: early_serial_base need to be static in misc.c ? -v4: create seperate string.c printf.c cmdline.c early_serial_console.c after hpa's patch that allow global variables in compressed/misc stage -v5: remove printf.c related Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 4 +- arch/x86/boot/compressed/cmdline.c | 21 ++++++++++ arch/x86/boot/compressed/early_serial_console.c | 5 +++ arch/x86/boot/compressed/misc.c | 56 +++++++++++++++---------- arch/x86/boot/compressed/misc.h | 38 +++++++++++++++++ arch/x86/boot/compressed/string.c | 4 ++ arch/x86/boot/main.c | 6 +-- 7 files changed, 105 insertions(+), 29 deletions(-) create mode 100644 arch/x86/boot/compressed/cmdline.c create mode 100644 arch/x86/boot/compressed/early_serial_console.c create mode 100644 arch/x86/boot/compressed/misc.h create mode 100644 arch/x86/boot/compressed/string.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fbb47daf245..0c229551eea 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c new file mode 100644 index 00000000000..cb62f786990 --- /dev/null +++ b/arch/x86/boot/compressed/cmdline.c @@ -0,0 +1,21 @@ +#include "misc.h" + +static unsigned long fs; 
+static inline void set_fs(unsigned long seg) +{ + fs = seg << 4; /* shift it back */ +} +typedef unsigned long addr_t; +static inline char rdfs8(addr_t addr) +{ + return *((char *)(fs + addr)); +} +#include "../cmdline.c" +int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); +} +int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); +} diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c new file mode 100644 index 00000000000..261e81fb958 --- /dev/null +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -0,0 +1,5 @@ +#include "misc.h" + +int early_serial_base; + +#include "../early_serial_console.c" diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 51e240779a4..8f7bef8e9ff 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -9,23 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -/* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening - */ -#undef CONFIG_PARAVIRT -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif - -#include -#include -#include -#include -#include -#include -#include +#include "misc.h" /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -123,15 +107,13 @@ static void error(char *m); /* * This is set up by the setup-routine at boot-time */ -static struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; +static int debug; void *memset(void *s, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); -static void __putstr(int, const char *); -#define putstr(__x) __putstr(0, __x) - #ifdef CONFIG_X86_64 #define memptr long #else @@ -170,7 +152,21 @@ static void scroll(void) vidmem[i] = ' '; } -static void __putstr(int error, const char *s) +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ +static void serial_putchar(int ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +void __putstr(int error, const char *s) { int x, y, pos; char c; @@ -179,6 +175,14 @@ static void __putstr(int error, const char *s) if (!error) return; #endif + if (early_serial_base) { + const char *str = s; + while (*str) { + if (*str == '\n') + serial_putchar('\r'); + serial_putchar(*str++); + } + } if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) @@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (real_mode->hdr.loadflags & QUIET_FLAG) + if (cmdline_find_option_bool("quiet")) quiet = 1; + if (cmdline_find_option_bool("debug")) + debug = 1; if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; @@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, lines = real_mode->screen_info.orig_video_lines; cols = real_mode->screen_info.orig_video_cols; + console_init(); + if (debug) + putstr("early console in decompress_kernel\n"); + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git 
a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h new file mode 100644 index 00000000000..a267849ac1c --- /dev/null +++ b/arch/x86/boot/compressed/misc.h @@ -0,0 +1,38 @@ +#ifndef BOOT_COMPRESSED_MISC_H +#define BOOT_COMPRESSED_MISC_H + +/* + * we have to be careful, because no indirections are allowed here, and + * paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening + */ +#undef CONFIG_PARAVIRT +#ifdef CONFIG_X86_32 +#define _ASM_X86_DESC_H 1 +#endif + +#include +#include +#include +#include +#include +#include +#include + +#define BOOT_BOOT_H + +/* misc.c */ +extern struct boot_params *real_mode; /* Pointer to real-mode data */ +void __putstr(int error, const char *s); +#define putstr(__x) __putstr(0, __x) +#define puts(__x) __putstr(0, __x) + +/* cmdline.c */ +int cmdline_find_option(const char *option, char *buffer, int bufsize); +int cmdline_find_option_bool(const char *option); + +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + +#endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c new file mode 100644 index 00000000000..7995c6a4950 --- /dev/null +++ b/arch/x86/boot/compressed/string.c @@ -0,0 +1,4 @@ +#include "misc.h" + +#include "../isdigit.h" +#include "../string.c" diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 4ef1a33e857..40358c8905b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -132,6 +132,8 @@ void main(void) /* Initialize the early-boot console */ console_init(); + if (cmdline_find_option_bool("debug")) + puts("early console in setup code\n"); /* End of heap check */ init_heap(); @@ -171,10 +173,6 @@ void main(void) /* Set the video mode */ set_video(); - /* Parse command line for 'quiet' and pass it to decompressor. */ - if (cmdline_find_option_bool("quiet")) - boot_params.hdr.loadflags |= QUIET_FLAG; - /* Do the last things and invoke protected mode */ go_to_protected_mode(); } -- cgit v1.2.3 From 6238b47b58480cd9c092600c05338dbe261b71ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 21:03:46 -0700 Subject: x86, setup: move isdigit.h to ctype.h, header files on top. It is a subset of functionality, so name it ctype.h. Also, reorganize header files so #include statements are clustered near the top as they should be. Signed-off-by: H. Peter Anvin LKML-Reference: <4C5752F2.8030206@kernel.org> --- arch/x86/boot/boot.h | 3 +-- arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/compressed/string.c | 2 -- arch/x86/boot/ctype.h | 21 +++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 --------------------- 5 files changed, 23 insertions(+), 25 deletions(-) create mode 100644 arch/x86/boot/ctype.h delete mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 00cf51cfc2e..c7093bd9f2d 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -28,6 +28,7 @@ #include "bitops.h" #include #include +#include "ctype.h" /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -200,8 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -#include "isdigit.h" - /* Heap -- available for dynamic lists. 
*/ extern char _end[]; extern char *HEAP; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a267849ac1c..3f19c81a620 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -20,6 +20,7 @@ #include #define BOOT_BOOT_H +#include "../ctype.h" /* misc.c */ extern struct boot_params *real_mode; /* Pointer to real-mode data */ diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 7995c6a4950..19b3e693cd7 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,4 +1,2 @@ #include "misc.h" - -#include "../isdigit.h" #include "../string.c" diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h new file mode 100644 index 00000000000..25e13403193 --- /dev/null +++ b/arch/x86/boot/ctype.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h deleted file mode 100644 index 25e13403193..00000000000 --- a/arch/x86/boot/isdigit.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef BOOT_ISDIGIT_H - -#define BOOT_ISDIGIT_H - -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} - -#endif -- cgit v1.2.3 From 35f2915c3bd0cd6950bdd9d461de565e8feae852 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 1 Jun 2010 13:07:34 +0100 Subject: intel_scu_ipc: add definitions for vRTC related command Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4470c9ad4a3..03200452069 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,6 +1,12 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#define IPCMSG_VRTC 0xFA /* Set vRTC device */ + +/* Command id associated with message IPCMSG_VRTC */ +#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ +#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ + /* Read single register */ int intel_scu_ipc_ioread8(u16 addr, u8 *data); -- cgit v1.2.3 From 804f8681a99da2aa49bd7f0dab3750848d1ab1bc Mon Sep 17 00:00:00 2001 From: Sreedhara DS Date: Mon, 26 Jul 2010 10:03:10 +0100 Subject: Remove indirect read write api support. The firmware of production devices does not support this interface so this is dead code. 
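For context, what survives this removal are the direct accessors declared earlier in intel_scu_ipc.h. A minimal sketch of a caller (the helper name and fallback logic are purely illustrative, not part of this patch):

	#include <asm/intel_scu_ipc.h>

	/* Read one register over SCU IPC; fall back to a default
	 * value if the IPC call fails. */
	static u8 example_scu_read(u16 addr, u8 def)
	{
		u8 val;

		if (intel_scu_ipc_ioread8(addr, &val))
			return def;
		return val;
	}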
Signed-off-by: Sreedhara DS Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 03200452069..29f66793cc5 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -34,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); /* Update single register based on the mask */ int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); -/* - * Indirect register read - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_read(u32 addr, u32 *data); - -/* - * Indirect register write - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_write(u32 addr, u32 data); - /* Issue commands to the SCU with or without data */ int intel_scu_ipc_simple_command(int cmd, int sub); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, -- cgit v1.2.3 From 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 18 May 2010 13:59:05 +0200 Subject: x86, mce: Notify about corrected events too Notify all parties registered on the mce decoder chain about logged correctable MCEs. Signed-off-by: Borislav Petkov Acked-by: Doug Thompson Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..1970ef911c9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } -- cgit v1.2.3 From 5d77b85458f656923b85291a4ff56ed44859ed52 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 20 Jul 2010 13:52:00 -0400 Subject: [CPUFREQ] pcc driver should check for pcch method before calling _OSC The pcc specification documents an _OSC method that's incompatible with the one defined as part of the ACPI spec. This shouldn't be a problem as both are supposed to be guarded with a UUID. Unfortunately approximately nobody (including HP, who wrote this spec) properly checks the UUID on entry to the _OSC call. Right now this could result in surprising behaviour if the pcc driver performs an _OSC call on a machine that doesn't implement the pcc specification. Check whether the PCCH method exists first in order to reduce this probability.
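The probe pattern used here generalizes to any optional ACPI method: look the handle up by name and bail out before evaluating anything. Roughly (a sketch mirroring the hunk below; variable names are illustrative):

	acpi_status status;
	acpi_handle sb_handle, pcch_handle;

	status = acpi_get_handle(NULL, "\\_SB", &sb_handle);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	/* Bail out early unless the platform really implements PCCH. */
	status = acpi_get_handle(sb_handle, "PCCH", &pcch_handle);
	if (ACPI_FAILURE(status))
		return -ENODEV;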
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e7..01bd25c3c7c 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void) struct pcc_memory_resource *mem_resource; struct pcc_register_resource *reg_resource; union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle; + acpi_handle handle, osc_handle, pcch_handle; int ret = 0; status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) return -ENODEV; + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + status = acpi_get_handle(handle, "_OSC", &osc_handle); if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); -- cgit v1.2.3 From 0d9715d64fe118dd0957a29e344972b8d3f960e7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 23 Jul 2010 23:06:52 +0100 Subject: [CPUFREQ] fix double freeing in error path of pcc-cpufreq Prevent double freeing on error path. Signed-off-by: Daniel J Blueman Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 01bd25c3c7c..900702888bf 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } + if (out_obj->type != ACPI_TYPE_BUFFER) + return -ENODEV; errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } + if (errors) + return -ENODEV; supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } + if (!(supported & 0x1)) + return -ENODEV; out_free: kfree(output.pointer); -- cgit v1.2.3 From 6ebdf777ba034d2b54c99f28a4b18dabf286d8e5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 15 Jul 2010 11:44:00 -0400 Subject: [CPUFREQ] Fix PCC driver error path The PCC cpufreq driver unmaps the mailbox address range if any CPUs fail to initialise, but doesn't do anything to remove the registered CPUs from the cpufreq core resulting in failures further down the line. We're better off simply returning a failure - the cpufreq core will unregister us cleanly if we end up with no successfully registered CPUs. Tidy up the failure path and also add a sanity check to ensure that the firmware gives us a realistic frequency - the core deals badly with that being set to 0. 
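The convention relied on here is that a cpufreq driver's ->init callback can simply return an error and let the core unwind whatever was registered before the failure. A minimal sketch (the driver and frequency-reading helper are hypothetical):

	static int example_cpufreq_cpu_init(struct cpufreq_policy *policy)
	{
		unsigned int cur = example_get_freq(policy->cpu); /* hypothetical */

		/* The core deals badly with policy->cur == 0, so fail instead. */
		if (!cur)
			return -EINVAL;

		policy->cur = cur;
		return 0;	/* any nonzero return: core unregisters us cleanly */
	}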
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 900702888bf..a36de5bbb62 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -541,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!pcch_virt_addr) { result = -1; - goto pcch_null; + goto out; } result = pcc_get_offset(cpu); if (result) { dprintk("init: PCCP evaluation failed\n"); - goto free; + goto out; } policy->max = policy->cpuinfo.max_freq = @@ -556,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) ioread32(&pcch_hdr->minimum_frequency) * 1000; policy->cur = pcc_get_freq(cpu); + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + dprintk("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); - - return 0; -free: - pcc_clear_mapping(); - free_percpu(pcc_cpu_info); -pcch_null: +out: return result; } -- cgit v1.2.3 From c2f4a2c6e08c7635316dfd25ef706e9104384c56 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 17:55:30 +0200 Subject: [CPUFREQ] powernow-k8: Limit Pstate transition latency check The Pstate transition latency check was added for broken F10h BIOSen which wrongly contain a value of 0 for transition and bus master latency. Fam11h and later, however, (will) have similar transition latency so extend that behavior for them too. Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e..3e90cce3dc8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1023,13 +1023,12 @@ static int get_transition_latency(struct powernow_k8_data *data) } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit v1.2.3 From 298decfbc44e9a4cb7862ae1b7dfc4e1ba3551b9 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Wed, 20 Jan 2010 19:19:33 +0200 Subject: [CPUFREQ] powernow-k8: On load failure, remind the user to enable support in BIOS setup On Wed, 2010-01-20 at 16:56 +0100, Thomas Renninger wrote: > But most often this happens if people upgrade their CPU and do not > update their BIOS. > Or the vendor does not recognise the new CPU even if the BIOS got > updated. Maybe some of those people just didn't realize it was disabled in BIOS? If you tell users that it's a firmware bug then they'll probably just give up. 
> The message itself might be an enhancement, IMO it's not worth a patch. Why do you think so? I spent an hour hunting down the BIOS upgrade, only to find that it didn't improve anything. It was a day later that I realized that it might be a BIOS option; and the option was literally the _last_ option in the whole BIOS setup. :) This message would have saved the day. > But do not revert the FW_BUG part! Sure, you have a point here. How about this patch? --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce3dc8..c48b44b3b43 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } -- cgit v1.2.3 From 6b72e3934b42930fd40fc42fe762d21be413301c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:35 +0200 Subject: [CPUFREQ] acpi-cpufreq: Fix CPU_ANY CPUFREQ_{PRE,POST}CHANGE notification Signed-off-by: Thomas Renninger CC: venki@google.com CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40e..cee7aa949c3 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -351,7 +351,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -367,7 +367,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } -- cgit v1.2.3 From 6f4f2723d08534fd4e407e1ef8500b0f4d12c30c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:36 +0200 Subject: [CPUFREQ] x86 cpufreq: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. trace_power_frequency was only implemented, in a rather ungeneric way, in the acpi-cpufreq driver's target() function. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where the CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequencies depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and passing the cpu which gets switched automatically fixes this.
Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id Signed-off-by: Thomas Renninger CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org CC: robert.schoene@tu-dresden.de Tested-by: robert.schoene@tu-dresden.de Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cee7aa949c3..246cd3afbb5 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..787572d43d9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.3 From ccc5638a20b0eb3a66666d9d4dd8fe8f5ad40386 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:03:55 +0400 Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code.
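For reference, the macro is shorthand for the very pci_get_device() walk it replaces; its definition in <linux/pci.h> is approximately:

	#define for_each_pci_dev(d) \
		while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)

	/* Typical use: start from NULL; the walk manages device refcounts. */
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev) {
		if (pci_match_id(example_id_table, pdev)) /* table name hypothetical */
			break;	/* pdev still holds a reference here */
	}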
Signed-off-by: Kulikov Vasiliy Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e..8c3325fee77 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -199,7 +199,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } -- cgit v1.2.3 From 55c789bb2bcdcaa8f1f60687b4a9dbd02ffddd88 Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Thu, 15 Jul 2010 20:36:41 +0200 Subject: [CPUFREQ] Convert pci_table entries to PCI_VDEVICE (if PCI_ANY_ID is used) This patch converts pci_table entries, where .subvendor=PCI_ANY_ID and .subdevice=PCI_ANY_ID, .class=0 and .class_mask=0, to use the PCI_VDEVICE macro, and thus improves readability. Signed-off-by: Peter Huewe Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 8c3325fee77..32974cf8423 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; -- cgit v1.2.3 From b30d3304c9c068ccfe6940232834768af75f8c9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 18:05:14 +0200 Subject: [CPUFREQ] powernow-k8: Fix misleading variable naming rdmsr() takes the lower 32 bits as a second argument and the high 32 as a third. Fix the names accordingly since they were swapped. There should be no functionality change resulting from this patch. 
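To see why the old names were misleading: RDMSR returns the low 32 bits of the MSR in EAX and the high 32 bits in EDX, and the kernel's rdmsr() macro maps its second argument to EAX and its third to EDX. So for a field that lives in the low half, as in the hunk below:

	u32 lo, hi, max_hw_pstate;

	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);	/* lo <- EAX, hi <- EDX */

	/* HW_PSTATE_MAX_MASK selects bits in the low word, hence "lo": */
	max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;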
Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c48b44b3b43..90cab2d4ac0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -912,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; -- cgit v1.2.3 From 7e2d81122052c83feeddbebf706b6d53fba7996d Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:28:49 +0800 Subject: [CPUFREQ] Fix section mismatch for longrun_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c5..fc09f142d94 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; -- cgit v1.2.3 From 2530573e45c5846cd238db78651f0d236fc78aab Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:03 +0800 Subject: [CPUFREQ] Fix section mismatch for longhaul_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. Use the __cpuinitdata for data used by the routines marked as __cpuinit. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. 
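The underlying rule: __init functions live in .init.text, which is discarded once boot finishes, while __cpuinit functions must stay resident whenever CPU hotplug is enabled, because onlining a CPU after boot can still call them. A sketch of the safe annotation (function name illustrative):

	/* May run again when a CPU is onlined after boot, so it must
	 * not be placed in .init.text (which is freed after boot). */
	static int __cpuinit example_cpu_init(struct cpufreq_policy *policy)
	{
		return 0;
	}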
Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/longhaul.h | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f826..03162dac627 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f7..cbf48fbca88 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. 
*/ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, -- cgit v1.2.3 From 307069cf6c53632adc27de4f49bf5d1d67cb87bb Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:16 +0800 Subject: [CPUFREQ] Fix section mismatch for powernow_cpu_init in powernow-k7.c Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e..4a45fd6e41b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. 
*/ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; -- cgit v1.2.3 From 9d1f44ee206a23b975d7d7c6f759efb25e0e61ac Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 3 Aug 2010 13:47:30 -0400 Subject: [CPUFREQ] Remove pointless printk from p4-clockmod. The only machines this is triggering on should be supported by acpi-cpufreq or acpi's internal throttling. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b0..bd1cac747f6 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ -- cgit v1.2.3 From cb84b19474384c572ba3aa2345815e555112ebf5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:43 -0700 Subject: x86, hwmon: Package Level Thermal/Power: pkgtemp hwmon driver This patch adds a hwmon driver for package level thermal control. The driver dumps package level thermal information through a sysfs interface so that upper level applications (e.g. lm-sensors) can retrieve the information. Instead of having the package level hwmon code in coretemp, I wrote a separate driver, pkgtemp, because: First, package level thermal sensors include not only sensors for each core, but also sensors for uncore, memory controller or other components in the package. Logically it is clearer to have a separate package level hwmon driver to monitor a wider range of sensors in a package. Merging the package thermal driver into the core thermal driver doesn't make sense and may mislead. Secondly, merging the two drivers together may cause a coding mess. It's easier to include various package level sensor info as more sensor information is implemented. Coretemp code needs to consider a lot of legacy machine cases. Pkgtemp code only considers platforms starting from Sandy Bridge.
On a 1Sx4Cx2T Sandy Bridge platform, lm-sensors dumps the pkgtemp and coretemp: pkgtemp-isa-0000 Adapter: ISA adapter physical id 0: +33.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0000 Adapter: ISA adapter Core 0: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0001 Adapter: ISA adapter Core 1: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0002 Adapter: ISA adapter Core 2: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0003 Adapter: ISA adapter Core 3: +32.0°C (high = +79.0°C, crit = +99.0°C) [ hpa: folded v3 patch removing improper global variable "SHOW" ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-3-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 1 + arch/x86/configs/x86_64_defconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d28fad19654..e3a32431ca1 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1471,6 +1471,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6c86acd847a..4251f837205 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1456,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set -- cgit v1.2.3 From 55d435a227bd28c77afab326de44dfacc0b15059 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:44 -0700 Subject: x86, hwmon: Package Level Thermal/Power: thermal throttling handler Add package level thermal throttle interrupt support. The interrupt handler increases the package level thermal throttle count. It also logs the event in the MCE log. The package level thermal throttle interrupt happens across threads in a package. Each thread handles the interrupt individually. User level applications are supposed to retrieve the correct event count and log based on package/thread topology. This is the same situation for the core level interrupt handler. In the future, the interrupt may be reported only per package or per core. core_throttle_count and package_throttle_count are used for the user interface. Previously only throttle_count was used for the core throttle count. If you think the new core_throttle_count name breaks the user interface, I can change this part. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-4-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 +++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf971..d307f9f64c2 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -37,7 +37,7 @@ /* * Current thermal throttling state: */ -struct thermal_state { +struct _thermal_state { bool is_throttled; u64 next_check; @@ -45,6 +45,11 @@ struct thermal_state { unsigned long last_throttle_count; }; +struct thermal_state { + struct _thermal_state core; + struct _thermal_state package; +}; + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(level, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##level##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).level.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_one_ro(package_throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; @@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 +static int therm_throt_process(bool is_throttled, int level) { - struct thermal_state *state; + struct _thermal_state *state; unsigned int this_cpu; bool was_throttled; u64 now; this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) + state = &per_cpu(thermal_state, this_cpu).core; + else + state = &per_cpu(thermal_state, this_cpu).package; was_throttled = state->is_throttled; state->is_throttled = is_throttled; @@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled) /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package", + state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_throttle_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device); static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + PACKAGE_LEVEL) != 0) + /* + * Set up the most significant bit to notify mce log + * that this thermal event is a package level event. + * This is a temp solution. May be changed in the future + * with mce log infrasture. + */ + mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); -- cgit v1.2.3 From 0199114c31798af5b83841b21759b64171060d9b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:45 -0700 Subject: x86, hwmon: Package Level Thermal/Power: power limit Power limit notification feature is published in Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification. It is implemented first on Intel Sandy Bridge platform. The patch handles notification interrupt. Interrupt handler dumps power limit information in log_buf, logs the event in mce log, and increases the event counters (core_power_limit and package_power_limit). Upper level applications could use the data to detect system health or diagnose functionality/performance issues. In the future, the event could be handled in a more fancy way. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-5-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 183 ++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d307f9f64c2..c2a8b26d4fe 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ struct _thermal_state { - bool is_throttled; - + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; }; struct thermal_state { - struct _thermal_state core; - struct _thermal_state package; + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly; therm_throt_sysdev_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(level, name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##level##_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).level.name); \ + per_cpu(thermal_state, cpu).event.name); \ } else \ ret = 0; \ preempt_enable(); \ @@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); define_therm_throt_sysdev_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); + static struct attribute *thermal_throttle_attrs[] = { &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -116,49 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. 
*/ -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 -static int therm_throt_process(bool is_throttled, int level) +static int therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - if (level == CORE_LEVEL) - state = &per_cpu(thermal_state, this_cpu).core; - else - state = &per_cpu(thermal_state, this_cpu).package; + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package"); return 1; } @@ -172,21 +207,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) int err; struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) return err; + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, - thermal_throttle_attr_group.name); + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -257,6 +300,17 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -264,21 +318,31 @@ static void intel_thermal_interrupt(void) struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) - /* - * Set up the most significant bit to notify mce log - * that this thermal event is a package level event. - * This is a temp solution. May be changed in the future - * with mce log infrasture. 
- */ - mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); } } @@ -381,14 +445,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); } smp_thermal_vector = intel_thermal_interrupt; -- cgit v1.2.3 From 8a22b9996b001c88f2bfb54c6de6a05fc39e177a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 12 Jul 2010 11:49:59 -0700 Subject: xen: drop xen_sched_clock in favour of using plain wallclock time xen_sched_clock only counts unstolen time. In principle this should be useful to the Linux scheduler so that it knows how much time a process actually consumed. But in practice this doesn't work very well as the scheduler expects the sched_clock time to be synchronized between cpus. It also uses sched_clock to measure the time a task spends sleeping, in which case "unstolen time" isn't meaningful. So just use plain xen_clocksource_read to return wallclock nanoseconds for sched_clock. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/time.c | 39 --------------------------------------- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 399bed2de88..fef034a04c2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -926,7 +926,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 32764b8880b..e90360ff4a0 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -155,45 +155,6 @@ static void do_stolen_accounting(void) account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. 
- */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { -- cgit v1.2.3 From c06ee78d73fd24e8d8a65f16380f6a0551107e1b Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 19 Jul 2010 10:25:08 -0700 Subject: xen: support large numbers of CPUs with vcpu info placement When vcpu info placement is supported, we're not limited to MAX_VIRT_CPUS vcpus. However, if it isn't supported, then ignore any excess vcpus. Signed-off-by: Mukesh Rathor Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fef034a04c2..90a3e802676 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -97,6 +97,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -104,13 +112,17 @@ static void xen_vcpu_setup(int cpu) struct vcpu_info *vcpup; BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } + vcpup = &per_cpu(xen_vcpu_info, cpu); info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); @@ -125,6 +137,7 @@ static void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ -- cgit v1.2.3 From f09f6d194d85043e0eb105a577e7ad6d8170ab66 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Thu, 15 Jul 2010 14:56:49 -0400 Subject: Xen: register panic notifier to take crashes of xen guests on panic Register a panic notifier so that when the guest crashes it can shut down the domain and indicate it was a crash to the host. 
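For reference, this follows the stock kernel panic-notifier pattern; a minimal sketch with placeholder names (the real hook added below is xen_panic_event()):

	#include <linux/kernel.h>	/* panic_notifier_list */
	#include <linux/notifier.h>

	static int example_panic_event(struct notifier_block *this,
				       unsigned long event, void *ptr)
	{
		/* tell the host this was a crash, then let other notifiers run */
		return NOTIFY_DONE;
	}

	static struct notifier_block example_panic_block = {
		.notifier_call	= example_panic_event,
	};

	/* registered once during arch setup: */
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_block);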
Signed-off-by: Donald Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 20 ++++++++++++++++++++ arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 90a3e802676..d99522e8f03 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1040,6 +1040,26 @@ static void xen_crash_shutdown(struct pt_regs *regs) xen_reboot(SHUTDOWN_crash); } +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct sched_shutdown r = { .reason = SHUTDOWN_crash}; + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) + BUG(); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + static const struct machine_ops __initdata xen_machine_ops = { .restart = xen_restart, .halt = xen_machine_halt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9deb6bab6c7..328b0030542 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -226,6 +226,8 @@ void __init xen_arch_setup(void) struct physdev_set_iopl set_iopl; int rc; + xen_panic_handler_init(); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bc..00d59d608ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -101,4 +101,6 @@ void xen_sysret32(void); void xen_sysret64(void); void xen_adjust_exception_frame(void); +extern int xen_panic_handler_init(void); + #endif /* XEN_OPS_H */ -- cgit v1.2.3 From 086748e52fb072ff0935ba4512e29c421bd5b716 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 3 Aug 2010 14:55:14 -0700 Subject: xen/panic: use xen_reboot and fix smp_send_stop Offline vcpu when using stop_self. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 +---- arch/x86/xen/smp.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d99522e8f03..3c4da8bee06 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1043,10 +1043,7 @@ static void xen_crash_shutdown(struct pt_regs *regs) static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct sched_shutdown r = { .reason = SHUTDOWN_crash}; - - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) - BUG(); + xen_reboot(SHUTDOWN_crash); return NOTIFY_DONE; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd313..25f232b18a8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -394,6 +394,8 @@ static void stop_self(void *v) load_cr3(swapper_pg_dir); /* should set up a minimal gdt */ + set_cpu_online(cpu, false); + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); BUG(); } -- cgit v1.2.3 From a7c55cbee0c1bae9bf5a15a08300e91d88706e45 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 4 Aug 2010 20:27:05 -0400 Subject: oprofile: add support for Intel processor model 30 Newer Intel processors identifying themselves as model 30 are not recognized by oprofile. 
model : 30 model name : Intel(R) Xeon(R) CPU X3470 @ 2.93GHz Running oprofile on these machines gives the following: + opcontrol --init + opcontrol --list-events oprofile: available events for CPU type "Intel Architectural Perfmon" See Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3B (Document 253669) Chapter 18 for architectural perfmon events This is a limited set of fallback events because oprofile doesn't know your CPU CPU_CLK_UNHALTED: (counter: all) Clock cycles when not halted (min count: 6000) INST_RETIRED: (counter: all) number of instructions retired (min count: 6000) LLC_MISSES: (counter: all) Last level cache demand requests from this core that missed the LLC (min count: 6000) Unit masks (default 0x41) ---------- 0x41: No unit mask LLC_REFS: (counter: all) Last level cache demand requests from this core (min count: 6000) Unit masks (default 0x4f) ---------- 0x4f: No unit mask BR_MISS_PRED_RETIRED: (counter: all) number of mispredicted branches retired (precise) (min count: 500) + opcontrol --shutdown Tested using oprofile 0.9.6. Signed-off-by: Josh Hunt Reviewed-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1ba67dc8006..f6b48f6c595 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -668,6 +668,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 0x1a: + case 0x1e: case 0x2e: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; -- cgit v1.2.3 From 12bfa3de63504d879ae427ec1f2884fc46556157 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:20 -0500 Subject: kgdb,x86: Individual register get/set for x86 Implement the ability to individually get and set registers for kdb and kgdb for x86. Signed-off-by: Jason Wessel Acked-by: H. Peter Anvin CC: Ingo Molnar CC: x86@kernel.org --- arch/x86/include/asm/kgdb.h | 20 +++--- arch/x86/kernel/kgdb.c | 168 ++++++++++++++++++++++---------------------- 2 files changed, 94 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 006da3687cd..396f5b5fc4d 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -39,9 +39,11 @@ enum regnames { GDB_FS, /* 14 */ GDB_GS, /* 15 */ }; +#define GDB_ORIG_AX 41 +#define DBG_MAX_REG_NUM 16 #define NUMREGBYTES ((GDB_GS+1)*4) #else /* ! CONFIG_X86_32 */ -enum regnames64 { +enum regnames { GDB_AX, /* 0 */ GDB_BX, /* 1 */ GDB_CX, /* 2 */ @@ -59,15 +61,15 @@ enum regnames64 { GDB_R14, /* 14 */ GDB_R15, /* 15 */ GDB_PC, /* 16 */ + GDB_PS, /* 17 */ + GDB_CS, /* 18 */ + GDB_SS, /* 19 */ }; - -enum regnames32 { - GDB_PS = 34, - GDB_CS, - GDB_SS, -}; -#define NUMREGBYTES ((GDB_SS+1)*4) -#endif /* CONFIG_X86_32 */ +#define GDB_ORIG_AX 57 +#define DBG_MAX_REG_NUM 20 +/* 17 64 bit regs and 3 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae..bae89825e14 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -49,55 +49,94 @@ #include #include -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. 
- * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return 
NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -150,47 +189,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. - */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; -- cgit v1.2.3 From 9264b278be42c031dc76517a0d4bb154f5dcf470 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Aug 2010 09:22:24 -0500 Subject: KGDB: Remove set but unused newPC Found by gcc 4.6's new warnings Signed-off-by: Andi Kleen Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bae89825e14..a8b80979ceb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -456,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -467,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); -- cgit v1.2.3 From df4939350b345ebb44937902827aa75b8ad4998c Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Thu, 5 Aug 2010 09:22:25 -0500 Subject: kgdb,x86: use macro HBP_NUM to replace magic number 4 Use the macros provided by the HW breakpoint API. 
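On x86 the limit is the four debug address registers DR0-DR3, which the HW breakpoint API already names; roughly (sketch, based on arch/x86/include/asm/hw_breakpoint.h):

	#define HBP_NUM 4	/* DR0..DR3: four hardware breakpoint slots */

	/* so slot loops read as: */
	for (i = 0; i < HBP_NUM; i++)
		if (!breakinfo[i].enabled)
			break;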
Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a8b80979ceb..ef10940e1af 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -195,7 +195,7 @@ static struct hw_breakpoint { int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; static unsigned long early_dr7; @@ -203,7 +203,7 @@ static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; @@ -290,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -311,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); @@ -331,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -395,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; if (dbg_is_early) { @@ -640,7 +640,7 @@ void kgdb_arch_late(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); -- cgit v1.2.3 From 5989cd6a1cbf86587edcc856791f960978087311 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Aug 2010 13:30:27 -0700 Subject: x86, apic: Map the local apic when parsing the MP table. This fixes a regression in 2.6.35 from 2.6.34, that is present for select models of Intel cpus when people are using an MP table. The commit cf7500c0ea133d66f8449d86392d83f840102632 "x86, ioapic: In mpparse use mp_register_ioapic" started calling mp_register_ioapic from MP_ioapic_info. An extremely simple change that was obviously correct. Unfortunately mp_register_ioapic did just a little more than the previous hand crafted code and so we gained this call path. The problem call path is: MP_ioapic_info() mp_register_ioapic() io_apic_unique_id() io_apic_get_unique_id() get_physical_broadcast() modern_apic() lapic_get_version() apic_read(APIC_LVR) Which turned out to be a problem because the local apic was not mapped, at that point, unlike the similar point in the ACPI parsing code. This problem is fixed by mapping the local apic when parsing the mptable as soon as we reasonably can. Looking at the number of places we setup the fixmap for the local apic, I see some serious simplification opportunities. For the moment except for not duplicating the setting up of the fixmap in init_apic_mappings, I have not acted on them. 
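Conceptually, the fix is just to establish the APIC fixmap before anything on that call path reads an APIC register; a sketch of the idea (the actual helper added below is smp_register_lapic_address()):

	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);	/* map the local APIC */
	apic_read(APIC_LVR);	/* reads like this are only safe afterwards */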
The regression from 2.6.34 is tracked in bug https://bugzilla.kernel.org/show_bug.cgi?id=16173 Cc: 2.6.35 Reported-by: David Hill Reported-by: Tvrtko Ursulin Tested-by: Tvrtko Ursulin Signed-off-by: Eric W. Biederman LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/mpparse.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489ee6ca..c07e51391a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void) * acpi lapic path already maps that address in * acpi_register_lapic_address() */ - if (!acpi_lapic) + if (!acpi_lapic && !smp_found_config) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54b..d7b6f7fb4fe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +static void __init smp_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; + /* Initialize the lapic mapping */ + if (!acpi_lapic) + smp_register_lapic_address(mpc->lapic); + if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); -- cgit v1.2.3 From 7645e4320497b35ce9fb6c2269ebcd57af9fe735 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 6 Aug 2010 12:18:11 -0700 Subject: x86, kvm: Remove cast obsoleted by set_64bit() prototype cleanup KVM ended up having to put a pretty ugly wrapper around set_64bit() in order to get the type right. Now set_64bit() takes the expected u64 type, and this wrapper can be cleaned up. Signed-off-by: H. Peter Anvin Cc: Avi Kivity LKML-Reference: <4C5C4E7A.8040603@kernel.org> Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0dcc95e0987..311f6dad895 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -281,11 +281,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) static void __set_spte(u64 *sptep, u64 spte) { -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif + set_64bit(sptep, spte); } static u64 __xchg_spte(u64 *sptep, u64 new_spte) -- cgit v1.2.3 From 7e005f79791dcd58436c88ded4a7f5aed1b82147 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 31 May 2010 15:59:04 +0900 Subject: remove needless ISA_DMA_THRESHOLD Architectures don't need to define ISA_DMA_THRESHOLD anymore. 
Signed-off-by: FUJITA Tomonori Acked-by: James Bottomley Acked-by: David Howells Signed-off-by: Jens Axboe --- arch/x86/include/asm/scatterlist.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index fb0b1874396..4240878b9d7 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -3,7 +3,6 @@ #include -#define ISA_DMA_THRESHOLD (0x00ffffff) #define ARCH_HAS_SG_CHAIN #endif /* _ASM_X86_SCATTERLIST_H */ -- cgit v1.2.3 From 1c250d709fdc8aa5bf42d90be99428a01a256a55 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 5 Aug 2010 19:09:17 +0400 Subject: perf, x86: P4 PMU -- update nmi irq statistics and unmask lvt entry properly If the last active performance counter has not overflowed at the moment the NMI is triggered by another counter, the irq statistics may miss an update stage. As a more serious consequence, the apic quirk may not be triggered, so the apic lvt entry stays masked. Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20100805150917.GA6311@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 107711bf0ee..febb12cea79 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -656,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -666,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); /* it might be unflagged overflow */ - handled = p4_pmu_clear_cccr_ovf(hwc); + overflow = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; + handled += overflow; + /* event overflow for sure */ data.period = event->hw.last_period; @@ -687,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled; + return handled > 0; } /* -- cgit v1.2.3 From 597781f3e51f48ef8e67be772196d9e9673752c4 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Mon, 9 Aug 2010 17:18:32 -0700 Subject: kmap_atomic: make kunmap_atomic() harder to misuse kunmap_atomic() is currently at level -4 on Rusty's "Hard To Misuse" list[1] ("Follow common convention and you'll get it wrong"), except in some architectures when CONFIG_DEBUG_HIGHMEM is set[2][3]. kunmap() takes a pointer to a struct page; kunmap_atomic(), however, takes a pointer to within the page itself. This seems to trip people up once in a while (the convention they are following is the one from kunmap()). Make it much harder to misuse, by moving it to level 9 on Rusty's list[4] ("The compiler/linker won't let you get it wrong"). This is done by refusing to build if the type of its first argument is a pointer to a struct page. The real kunmap_atomic() is renamed to kunmap_atomic_notypecheck() (which is what you would call if, for some strange reason, calling it with a pointer to a struct page is not incorrect in your code). The previous version of this patch was compile tested on x86-64. 
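The arch/x86 diff below only performs the rename; the compile-time check itself lands in the generic highmem header and looks roughly like this (sketch, not part of the x86 diff):

	#define kunmap_atomic(addr, idx) do { \
			BUILD_BUG_ON(__same_type((addr), struct page *)); \
			kunmap_atomic_notypecheck((addr), (idx)); \
		} while (0)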
[1] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html [2] In these cases, it is at level 5, "Do it right or it will always break at runtime." [3] At least mips and powerpc look very similar, and sparc also seems to share a common ancestor with both; there seems to be quite some degree of copy-and-paste coding here. The include/asm/highmem.h file for these three archs mention x86 CPUs at its top. [4] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html [5] As an aside, could someone tell me why mn10300 uses unsigned long as the first parameter of kunmap_atomic() instead of void *? Signed-off-by: Cesar Eduardo Barros Cc: Russell King (arch/arm) Cc: Ralf Baechle (arch/mips) Cc: David Howells (arch/frv, arch/mn10300) Cc: Koichi Yasutake (arch/mn10300) Cc: Kyle McMartin (arch/parisc) Cc: Helge Deller (arch/parisc) Cc: "James E.J. Bottomley" (arch/parisc) Cc: Benjamin Herrenschmidt (arch/powerpc) Cc: Paul Mackerras (arch/powerpc) Cc: "David S. Miller" (arch/sparc) Cc: Thomas Gleixner (arch/x86) Cc: Ingo Molnar (arch/x86) Cc: "H. Peter Anvin" (arch/x86) Cc: Arnd Bergmann (include/asm-generic) Cc: Rusty Russell ("Hard To Misuse" list) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a726650fc80..8caac76ac32 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba66cbe..5e8fa12ef86 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type) return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); -- cgit v1.2.3 From 4e60c86bd9e5a7110ed28874d0b6592186550ae8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 9 Aug 2010 17:19:03 -0700 Subject: gcc-4.6: mm: fix unused but set warnings No real bugs, just some dead code and some fixups. 
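The recurring fix is the cast-to-void idiom, which makes a macro or function formally use a value; a minimal illustration (not from the patch):

	extern int compute(void);

	void example(void)
	{
		int x = compute();
		/* without the next line, gcc 4.6 warns:
		 * "variable 'x' set but not used" */
		(void)x;
	}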
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 181be528c61..076052cd62b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_unmap(pte) ((void)(pte))/* NOP */ +#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) -- cgit v1.2.3 From d7a7c573936a86474c4a5090a45a4bc6e680c117 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Aug 2010 17:20:33 -0700 Subject: x86, ia64, smp: use workqueues unconditionally during do_boot_cpu() Workqueues are now initialized as part of an early_initcall(). So they are available for use during the cold boot process as well. Signed-off-by: Suresh Siddha Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51620953b18..a5e928b0cb5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); -- cgit v1.2.3 From 8cbd84f2dd4e52a8771b191030c374ba3e56d291 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Aug 2010 15:35:10 -0700 Subject: x86: fix up system call numbering nit As pointed out by Jiri Slaby: when I resolved the 32-bit x86 system call entry tables for prlimit (due to the conflict with fanotify), I forgot to add the numbering in comments that we do for every fifth entry. 
Reported-by: Jiri Slaby Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 91dc4bb1303..b86feabed69 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -844,5 +844,5 @@ ia32_sys_call_table: .quad compat_sys_recvmmsg .quad sys_fanotify_init .quad sys32_fanotify_mark - .quad sys_prlimit64 + .quad sys_prlimit64 /* 340 */ ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4802accb9d8..b35786dc9b8 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -339,4 +339,4 @@ ENTRY(sys_call_table) .long sys_recvmmsg .long sys_fanotify_init .long sys_fanotify_mark - .long sys_prlimit64 + .long sys_prlimit64 /* 340 */ -- cgit v1.2.3 From 8fd49936a8cac246fc9ed85508556c82cd44cf68 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Aug 2010 15:37:41 +0900 Subject: x86: Document __phys_reloc_hide() usage in __pa_symbol() Until all supported versions of gcc recognize -fno-strict-overflow, we should keep the RELOC_HIDE() magic in __pa_symbol(). Comment it. Signed-off-by: Namhyung Kim LKML-Reference: <1281508661-29507-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 625c3f0e741..8ca82839288 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ +/* + * We need __phys_reloc_hide() here because gcc may assume that there is no + * overflow during __pa() calculation and can optimize it unexpectedly. + * Newer versions of gcc provide -fno-strict-overflow switch to handle this + * case properly. Once all supported versions of gcc understand it, we can + * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) + */ #define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -- cgit v1.2.3 From 4565f0170dfc849b3629c27d769db800467baa62 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:22 -0700 Subject: dma-mapping: unify dma_get_cache_alignment implementations dma_get_cache_alignment returns the minimum DMA alignment. Architectures define it as ARCH_DMA_MINALIGN (formerly ARCH_KMALLOC_MINALIGN). So we can unify dma_get_cache_alignment implementations. Note that some architectures implement dma_get_cache_alignment wrongly. dma_get_cache_alignment() should return the minimum DMA alignment. So fully-coherent architectures should return 1. This patch also fixes this issue. 
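The unified copy ends up in the generic dma-mapping header, roughly like this (sketch; the x86 diff below only removes the arch-private version):

	static inline int dma_get_cache_alignment(void)
	{
	#ifdef ARCH_DMA_MINALIGN
		return ARCH_DMA_MINALIGN;
	#endif
		return 1;	/* fully-coherent: no minimum DMA alignment */
	}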
Signed-off-by: FUJITA Tomonori Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ac91eed2106..f9c67e83f64 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -87,13 +87,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -static inline int dma_get_cache_alignment(void) -{ - /* no easy way to get cache size on all x86, so return the - * maximum possible, to be safe */ - return boot_cpu_data.x86_clflush_size; -} - static inline unsigned long dma_alloc_coherent_mask(struct device *dev, gfp_t gfp) { -- cgit v1.2.3 From 3b9c6c11f519718d618f5d7c9508daf78b207f6f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:25 -0700 Subject: dma-mapping: remove dma_is_consistent API Architectures implement dma_is_consistent() in different ways (some misinterpret the definition of the API in DMA-API.txt). So it hasn't been very useful for drivers. We have only one user of the API in tree, and it is unlikely that out-of-tree drivers use the API. Even if we fix dma_is_consistent() in some architectures, it doesn't look useful at all. It was invented long ago for some old systems that can't allocate coherent memory at all. It's better to export only APIs that are definitely necessary for drivers. Let's remove this API. Signed-off-by: FUJITA Tomonori Cc: James Bottomley Reviewed-by: Konrad Rzeszutek Wilk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f9c67e83f64..d4c419f883a 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) -#define dma_is_consistent(d, h) (1) extern int dma_supported(struct device *hwdev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); -- cgit v1.2.3 From 30246557a06bb20618bed906a06d1e1e0faa8bb4 Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Fri, 6 Aug 2010 04:04:38 +0200 Subject: x86, asm: Refactor atomic64_386_32.S to support old binutils and be cleaner The old code didn't work on binutils 2.12 because setting a symbol to a register apparently requires a fairly recent version. This commit refactors the code to use the C preprocessor instead, and in the process makes the whole code a bit easier to understand. The object code produced is unchanged as expected. This fixes kernel bugzilla 16506. Reported-by: Dieter Stussy Signed-off-by: Luca Barbieri Signed-off-by: H. 
Peter Anvin Cc: 2.6.35 LKML-Reference: --- arch/x86/lib/atomic64_386_32.S | 236 ++++++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 4a5979aa688..78ee8e0fa27 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -25,150 +25,170 @@ CFI_ADJUST_CFA_OFFSET -4 .endm -.macro BEGIN func reg -$v = \reg - -ENTRY(atomic64_\func\()_386) - CFI_STARTPROC - LOCK $v - -.macro RETURN - UNLOCK $v +#define BEGIN(op) \ +.macro END; \ + CFI_ENDPROC; \ +ENDPROC(atomic64_##op##_386); \ +.purgem END; \ +.endm; \ +ENTRY(atomic64_##op##_386); \ + CFI_STARTPROC; \ + LOCK v; + +#define RET \ + UNLOCK v; \ ret -.endm - -.macro END_ - CFI_ENDPROC -ENDPROC(atomic64_\func\()_386) -.purgem RETURN -.purgem END_ -.purgem END -.endm - -.macro END -RETURN -END_ -.endm -.endm - -BEGIN read %ecx - movl ($v), %eax - movl 4($v), %edx -END - -BEGIN set %esi - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN xchg %esi - movl ($v), %eax - movl 4($v), %edx - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN add %ecx - addl %eax, ($v) - adcl %edx, 4($v) -END -BEGIN add_return %ecx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN sub %ecx - subl %eax, ($v) - sbbl %edx, 4($v) -END - -BEGIN sub_return %ecx +#define RET_END \ + RET; \ + END + +#define v %ecx +BEGIN(read) + movl (v), %eax + movl 4(v), %edx +RET_END +#undef v + +#define v %esi +BEGIN(set) + movl %ebx, (v) + movl %ecx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(xchg) + movl (v), %eax + movl 4(v), %edx + movl %ebx, (v) + movl %ecx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(add) + addl %eax, (v) + adcl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(add_return) + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(sub) + subl %eax, (v) + sbbl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(sub_return) negl %edx negl %eax sbbl $0, %edx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN inc %esi - addl $1, ($v) - adcl $0, 4($v) -END - -BEGIN inc_return %esi - movl ($v), %eax - movl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(inc) + addl $1, (v) + adcl $0, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(inc_return) + movl (v), %eax + movl 4(v), %edx addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN dec %esi - subl $1, ($v) - sbbl $0, 4($v) -END - -BEGIN dec_return %esi - movl ($v), %eax - movl 4($v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(dec) + subl $1, (v) + sbbl $0, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(dec_return) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v -BEGIN add_unless %ecx +#define v %ecx +BEGIN(add_unless) addl %eax, %esi adcl %edx, %edi - addl ($v), %eax - adcl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx cmpl %eax, %esi je 3f 1: - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: cmpl %edx, %edi jne 1b xorl %eax, %eax jmp 2b -END_ +END +#undef v -BEGIN inc_not_zero %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(inc_not_zero) + movl (v), %eax + movl 4(v), %edx testl %eax, %eax je 3f 
1: addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: testl %edx, %edx jne 1b jmp 2b -END_ +END +#undef v -BEGIN dec_if_positive %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(dec_if_positive) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx js 1f - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) 1: -END +RET_END +#undef v -- cgit v1.2.3 From 417484d47e115774745ef025bce712a102b6f86f Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Thu, 12 Aug 2010 07:00:35 -0700 Subject: x86, asm: Use a lower case name for the end macro in atomic64_386_32.S Use a lowercase name for the end macro, which somehow fixes a binutils 2.16 problem. Signed-off-by: Luca Barbieri LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/lib/atomic64_386_32.S | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 78ee8e0fa27..2cda60a06e6 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -26,35 +26,37 @@ .endm #define BEGIN(op) \ -.macro END; \ +.macro endp; \ CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ -.purgem END; \ +.purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ CFI_STARTPROC; \ LOCK v; +#define ENDP endp + #define RET \ UNLOCK v; \ ret -#define RET_END \ +#define RET_ENDP \ RET; \ - END + ENDP #define v %ecx BEGIN(read) movl (v), %eax movl 4(v), %edx -RET_END +RET_ENDP #undef v #define v %esi BEGIN(set) movl %ebx, (v) movl %ecx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -63,14 +65,14 @@ BEGIN(xchg) movl 4(v), %edx movl %ebx, (v) movl %ecx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx BEGIN(add) addl %eax, (v) adcl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -79,14 +81,14 @@ BEGIN(add_return) adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx BEGIN(sub) subl %eax, (v) sbbl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -98,14 +100,14 @@ BEGIN(sub_return) adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi BEGIN(inc) addl $1, (v) adcl $0, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -116,14 +118,14 @@ BEGIN(inc_return) adcl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi BEGIN(dec) subl $1, (v) sbbl $0, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -134,7 +136,7 @@ BEGIN(dec_return) sbbl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -156,7 +158,7 @@ BEGIN(add_unless) jne 1b xorl %eax, %eax jmp 2b -END +ENDP #undef v #define v %esi @@ -177,7 +179,7 @@ BEGIN(inc_not_zero) testl %edx, %edx jne 1b jmp 2b -END +ENDP #undef v #define v %esi @@ -190,5 +192,5 @@ BEGIN(dec_if_positive) movl %eax, (v) movl %edx, 4(v) 1: -RET_END +RET_ENDP #undef v -- cgit v1.2.3 From a3da323420d5aa6f7bd15efc7bf34cd6d19e1f1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 8 Aug 2010 02:37:23 +0900 Subject: [CPUFREQ] add missing __percpu markup in pcc-cpufreq.c pcc_cpu_info is a percpu pointer but was missing __percpu markup. Add it. 
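The marker is for sparse: a __percpu pointer is a cookie into the percpu area, not a plain dereferenceable address, and must be turned into a real pointer through the accessors. A minimal sketch (illustrative fragment):

	static struct pcc_cpu __percpu *pcc_cpu_info;

	/* allocation yields a __percpu cookie ... */
	pcc_cpu_info = alloc_percpu(struct pcc_cpu);

	/* ... which per_cpu_ptr() converts into a usable pointer: */
	struct pcc_cpu *pcc = per_cpu_ptr(pcc_cpu_info, cpu);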
Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5bbb62..994230d4dc4 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -110,7 +110,7 @@ struct pcc_cpu { u32 output_offset; }; -static struct pcc_cpu *pcc_cpu_info; +static struct pcc_cpu __percpu *pcc_cpu_info; static int pcc_cpufreq_verify(struct cpufreq_policy *policy) { -- cgit v1.2.3 From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 Aug 2010 14:20:10 -0700 Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods The current computation, introduced with f12a15be63, of FSEC_PER_SEC using the multiplication of (FSEC_PER_NSEC * NSEC_PER_SEC) is performed only with 32bit integers on small machines, resulting in an overflow and *very* short intervals being programmed. An interrupt storm follows. Note that we also have to specify FSEC_PER_SEC as being long long to overcome the same limitations. Signed-off-by: Chris Wilson Signed-off-by: John Stultz Cc: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 33dbcc4ec5f..351f9c0fea1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) * scaled math multiplication factor for nanosecond to hpet tick * conversion. */ - hpet_freq = 1000000000000000ULL; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); evt->mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, evt->shift); @@ -837,7 +837,7 @@ static int hpet_clocksource_register(void) * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period */ - hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); -- cgit v1.2.3 From 1d6225e8cc5598f2bc5c992f9c88b1137763e8e1 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 9 Aug 2010 16:11:22 -0500 Subject: x86, UV: Make kdump avoid stack dumps - fix !CONFIG_KEXEC breakage This replaces Version 1 of this patch, which broke the build when CONFIG_KEXEC and CONFIG_CRASH_DUMP were configured off. In that case the storage for the 'in_crash_kexec' flag was never built. This version defines that flag as 0 if CONFIG_KEXEC is not set. The patch is tested with all combinations of those two options. Signed-off-by: Cliff Wickman Cc: Andrew Morton LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/kdebug.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 7a2910b3512..5bdfca86581 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -33,6 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +#ifdef CONFIG_KEXEC extern int in_crash_kexec; +#else +/* no crash dump is ever in progress if no crash kernel can be kexec'd */ +#define in_crash_kexec 0 +#endif #endif /* _ASM_X86_KDEBUG_H */ -- cgit v1.2.3 From 3f6c4df7e1f05f2fdb3f20cac9c03f595a72b45d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 13 Aug 2010 23:00:11 +0900 Subject: [CPUFREQ] acpi-cpufreq: add missing __percpu markup acpi_perf_data is a percpu pointer but was missing __percpu markup. Add it. Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 246cd3afbb5..cd8da247dda 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -72,7 +72,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); /* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance *acpi_perf_data; +static struct acpi_processor_performance __percpu *acpi_perf_data; static struct cpufreq_driver acpi_cpufreq_driver; -- cgit v1.2.3 From 96054569190bdec375fe824e48ca1f4e3b53dd36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 Aug 2010 09:49:20 -0700 Subject: x86: don't send SIGBUS for kernel page faults It's wrong for several reasons, but the most direct one is that the fault may be for the stack accesses to set up a previous SIGBUS. When we have a kernel exception, the kernel exception handler does all the fixups, not some user-level signal handler. Even apart from the nested SIGBUS issue, it's also wrong to give out kernel fault addresses in the signal handler info block, or to send a SIGBUS when a system call already returns EFAULT. Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f62777940df..4c4508e8a20 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) + if (!(error_code & PF_USER)) { no_context(regs, error_code, address); + return; + } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) -- cgit v1.2.3 From c7887325230aec47d47a32562a6e26014a0fafca Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Aug 2010 11:26:22 +0100 Subject: Mark arguments to certain syscalls as being const Mark arguments to certain system calls as being const where they should be but aren't. The list includes: (*) The filename arguments of various stat syscalls, execve(), various utimes syscalls and some mount syscalls. 
(*) The filename arguments of some syscall helpers relating to the above. (*) The buffer argument of various write syscalls. Signed-off-by: David Howells Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/x86/ia32/sys_ia32.c | 14 +++++++------- arch/x86/include/asm/sys_ia32.h | 12 ++++++------ arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/process.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 3d093311d5e..849813f398e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -51,7 +51,7 @@ #define AA(__x) ((unsigned long)(__x)) -asmlinkage long sys32_truncate64(char __user *filename, +asmlinkage long sys32_truncate64(const char __user *filename, unsigned long offset_low, unsigned long offset_high) { @@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) return 0; } -asmlinkage long sys32_stat64(char __user *filename, +asmlinkage long sys32_stat64(const char __user *filename, struct stat64 __user *statbuf) { struct kstat stat; @@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename, return ret; } -asmlinkage long sys32_lstat64(char __user *filename, +asmlinkage long sys32_lstat64(const char __user *filename, struct stat64 __user *statbuf) { struct kstat stat; @@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, +asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, struct stat64 __user *statbuf, int flag) { struct kstat stat; @@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, - u32 poslo, u32 poshi) +asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, + u32 count, u32 poslo, u32 poshi) { return sys_pwrite64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); @@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, +asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { long error; diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index cf4e2e381cb..cb238526a9f 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -18,13 +18,13 @@ #include /* ia32/sys_ia32.c */ -asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); +asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long); asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); -asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); -asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *); +asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *); asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); -asmlinkage long sys32_fstatat(unsigned int, char __user *, +asmlinkage long sys32_fstatat(unsigned int, const char __user *, struct stat64 __user *, int); struct mmap_arg_struct32; asmlinkage long sys32_mmap(struct mmap_arg_struct32 
__user *); @@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); -asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); +asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, +asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 5c044b43e9a..feb2ff9bfc2 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); -long sys_execve(char __user *, char __user * __user *, +long sys_execve(const char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c5ea5cdbe7b..17be5ec7cbb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1185,13 +1185,13 @@ END(kernel_thread_helper) * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. * * C extern interface: - * extern long execve(char *name, char **argv, char **envp) + * extern long execve(const char *name, char **argv, char **envp) * * asm input arguments: * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) + * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d401f1d2d06..64ecaf0af9a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. */ -long sys_execve(char __user *name, char __user * __user *argv, +long sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; -- cgit v1.2.3