From e40b17208b6805be50ffe891878662b6076206b9 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:03 -0500 Subject: x86: Move notify_die from nmi.c to traps.c In order to handle a new nmi_watchdog approach, I need to move the notify_die() routine out of nmi_watchdog_tick() and into default_do_nmi(). This lets me easily swap out the old nmi_watchdog with the new one with just a config change. The change probably makes sense from a high level perspective because the nmi_watchdog shouldn't be handling notify_die routines anyway. However, this move does change the semantics a little bit. Instead of checking on every nmi interrupt if the cpus are stuck, only check them on the nmi_watchdog interrupts. v2: Move notify_die call into #ifdef block Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 7 ------- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396c..5d47682f580 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -400,13 +400,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e445418..51ef893ffa6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -400,7 +400,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. -- cgit v1.2.3 From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:04 -0500 Subject: nmi_watchdog: Add new, generic implementation, using perf events This is a new generic nmi_watchdog implementation using the perf events infrastructure as suggested by Ingo. The implementation is simple, just create an in-kernel perf event and register an overflow handler to check for cpu lockups. I created a generic implementation that lives in kernel/ and the hardware specific part that for now lives in arch/x86. This approach has a number of advantages: - It simplifies the x86 PMU implementation in the long run, in that it removes the hardcoded low-level PMU implementation that was the NMI watchdog before. - It allows new NMI watchdog features to be added in a central place. - It allows other architectures to enable the NMI watchdog, as long as they have perf events (that provide NMIs) implemented. - It also allows for more graceful co-existence of existing perf events apps and the NMI watchdog - before these changes the relationship was exclusive. (The NMI watchdog will 'spend' a perf event when enabled. In later iterations we might be able to piggyback from an existing NMI event without having to allocate a hardware event for the NMI watchdog - turning this into a no-hardware-cost feature.)
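[ Editor's sketch: the generic half of this design lives in kernel/ and is therefore not part of this arch/x86-limited listing. A minimal illustration of the idea, assuming the perf API of this era (perf_event_create_kernel_counter() taking a pid argument); the wd_overflow/enable_nmi_watchdog names are illustrative, not necessarily the ones merged: ]

#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/kernel.h>
/* hw_nmi_is_cpu_stuck()/hw_nmi_get_sample_period() are the arch helpers
 * introduced by this series; their declaring header is assumed here. */

static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);

/* Runs in NMI context each time the cycle counter overflows. */
static void wd_overflow(struct perf_event *event, int nmi,
			struct perf_sample_data *data, struct pt_regs *regs)
{
	if (hw_nmi_is_cpu_stuck(regs))
		panic("NMI watchdog: CPU %d stuck!", smp_processor_id());
}

static int enable_nmi_watchdog(int cpu)
{
	struct perf_event *event;
	struct perf_event_attr wd_attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(wd_attr),
		.pinned		= 1,
		.disabled	= 1,
	};

	/* one NMI per watchdog period's worth of unhalted cycles */
	wd_attr.sample_period = hw_nmi_get_sample_period();

	event = perf_event_create_kernel_counter(&wd_attr, cpu, -1, wd_overflow);
	if (IS_ERR(event))
		return PTR_ERR(event);

	per_cpu(nmi_watchdog_ev, cpu) = event;
	perf_event_enable(event);
	return 0;
}

[ The hw_nmi.c file added below supplies exactly the two arch hooks this sketch assumes: the stuck-cpu heuristic and the sample period. ]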
As for compatibility, we'll keep the old NMI watchdog code as well until the new one can 100% replace it on all CPUs, old and new alike. That might take some time as the NMI watchdog has been ported to many CPU models. I have done light testing to make sure the framework works correctly and it does. v2: Set the correct timeout values based on the old nmi watchdog Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 arch/x86/kernel/apic/hw_nmi.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 00000000000..8c0e6a410d0 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,114 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +static DEFINE_PER_CPU(unsigned, last_irq_sum); + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +int hw_nmi_is_cpu_stuck(struct pt_regs *regs) +{ + unsigned int sum; + int cpu = smp_processor_id(); + + /* FIXME: cheap hack for this check, probably should get its own + * die_notifier handler + */ + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + } + + /* if we are doing an mce, just assume the cpu is not stuck */ + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; + + /* We determine if the cpu is stuck by checking whether any + * interrupts have happened since we last checked. 
Of course + * an nmi storm could create false positives, but the higher + * level logic should account for that + */ + sum = get_timer_irqs(cpu); + if (__get_cpu_var(last_irq_sum) == sum) { + return 1; + } else { + __get_cpu_var(last_irq_sum) = sum; + return 0; + } +} + +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +/* STUB calls to mimic old nmi_watchdog behaviour */ +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +int nmi_watchdog_enabled; +int unknown_nmi_panic; +void cpu_nmi_set_wd_enabled(void) { return; } +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.3 From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 7 ++++++- arch/x86/kernel/traps.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507..1a4512e48d2 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_NMI_WATCHDOG),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 51ef893ffa6..973cbc4f044 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; +#ifndef CONFIG_NMI_WATCHDOG /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_NMI_WATCHDOG */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.3 From c3128fb6ad39b0edda6675d20585a64846cf89ea Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:18 -0500 Subject: nmi_watchdog: Use a boolean config flag for compiling Determines if an arch has setup arch specific perf_events and nmi_watchdog code. 
This should restrict compiles to only those arches ready. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfdee3ee..4f9685fa3a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -52,6 +52,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS + select PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.2.3 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. Tested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f334..5b41b0feb6d 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d0..312d772c5c3 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) { @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.3 From 2cc4452bc31fc1cde6f0b64a4eb13269f982787d Mon Sep 17 00:00:00 2001 From: Don Zickus Date:
Thu, 18 Feb 2010 21:56:52 -0500 Subject: nmi_watchdog: Fix undefined 'apic' build bug Ingo provided me a config that fails to compile with: arch/x86/built-in.o: In function `arch_trigger_all_cpu_backtrace': (.text+0x17e78): undefined reference to `apic' make: *** [.tmp_vmlinux1] Error 1 I realized I changed the compile behaviour of the nmi code by not wrapping it with CONFIG_LOCAL_APIC. To fix this I add a compile check for ARCH_HAS_NMI_WATCHDOG around arch_trigger_all_cpu_backtrace. Signed-off-by: Don Zickus Cc: a.p.zijlstra@chello.nl Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266548212-24243-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 312d772c5c3..0b4d205a6b8 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -92,6 +92,7 @@ u64 hw_nmi_get_sample_period(void) return cpu_khz * 1000; } +#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; @@ -108,6 +109,7 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } +#endif /* STUB calls to mimic old nmi_watchdog behaviour */ #if defined(CONFIG_X86_LOCAL_APIC) -- cgit v1.2.3 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 +++++++------- arch/x86/kernel/traps.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8..e8b78a0be5d 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. 
Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044..bdc7fab3ef3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* -- cgit v1.2.3 From 45c34e05c4e3d36e7c44e790241ea11a1d90d54e Mon Sep 17 00:00:00 2001 From: John Villalovos Date: Fri, 7 May 2010 12:41:40 -0400 Subject: Oprofile: Change CPUIDS from decimal to hex, and add some comments Back when the patch was submitted for "Add Xeon 7500 series support to oprofile", Robert Richter had asked for a followon patch that converted all the CPU ID values to hex. I have done that here for the "i386/core_i7" and "i386/atom" class processors in the ppro_init() function and also added some comments on where to find documentation on the Intel processors. Signed-off-by: John L. Villalovos Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b28d2f1253b..1ba67dc8006 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -634,6 +634,18 @@ static int __init ppro_init(char **cpu_type) if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -655,12 +667,12 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x1a: case 0x2e: - case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; - case 28: + case 0x1c: *cpu_type = "i386/atom"; break; default: -- cgit v1.2.3 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. 
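[ Editor's sketch: the combined detector itself lands in kernel/watchdog.c, outside this arch/x86 listing. Stripped to its core, both checks reduce to per-cpu counter comparisons; variable, helper, and threshold names below are illustrative, not necessarily the merged ones: ]

#include <linux/percpu.h>
#include <linux/jiffies.h>	/* time_after() works on any wrapping counters */

static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);	/* seconds; the kthread refreshes it */
static int softlockup_thresh = 60;	/* seconds; tunable in the real code */

/* Called from the perf NMI: has the hrtimer fired since the last NMI? */
static int is_hardlockup(int cpu)
{
	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);

	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
		return 1;	/* interrupts stalled on this cpu */

	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
	return 0;
}

/* Called from the hrtimer: has the watchdog kthread run recently? */
static int is_softlockup(int cpu, unsigned long now)
{
	unsigned long touch_ts = per_cpu(watchdog_touch_ts, cpu);

	if (time_after(now, touch_ts + softlockup_thresh))
		return 1;	/* the kthread is being starved */

	return 0;
}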
If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - separated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 ++-- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0feb6d..932f0f86b4b 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d2..52f32e0ea19 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5d..79425f96fce 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3..bd347c2b34d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog.
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.3 From 7cbb7e7fa46f6e5229438ac9e4a5c72ec0d53e0b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:48 -0400 Subject: x86: Move trigger_all_cpu_backtrace to its own die_notifier As part of the transition of the nmi watchdog to something more generic, the trigger_all_cpu_backtrace code is getting left behind. Put it in its own die_notifier so it can still be used. V2: - use arch_spin_locks Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-6-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79425f96fce..8c3edfb89c2 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,6 +17,10 @@ #include #include #include +#include +#include +#include + #include #include @@ -54,20 +58,6 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) unsigned int sum; int cpu = smp_processor_id(); - /* FIXME: cheap hack for this check, probably should get its own - * die_notifier handler - */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - } - /* if we are doing an mce, just assume the cpu is not stuck */ /* Could check oops_in_progress here too, but it's safer not to */ if (mce_in_progress()) @@ -109,6 +99,53 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = 1 +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); #endif /* STUB calls to mimic old nmi_watchdog behaviour */ -- cgit v1.2.3 From 10f9014912a2b1cb59c39cdea777e6d9afa8f17e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:49 -0400 Subject: x86: Cleanup hw_nmi.c cruft The design of the hardlockup watchdog has changed and cruft was left behind in the hw_nmi.c file. 
Just remove the code that isn't used anymore. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-7-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 58 ------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c3edfb89c2..3b40082f037 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,74 +9,16 @@ * */ -#include -#include #include -#include -#include -#include -#include -#include #include #include #include - - #include #include /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static DEFINE_PER_CPU(unsigned, last_irq_sum); - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; - -#if defined(CONFIG_X86_LOCAL_APIC) - irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif - - return irqs; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -int hw_nmi_is_cpu_stuck(struct pt_regs *regs) -{ - unsigned int sum; - int cpu = smp_processor_id(); - - /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; - - /* We determine if the cpu is stuck by checking whether any - * interrupts have happened since we last checked. Of course - * an nmi storm could create false positives, but the higher - * level logic should account for that - */ - sum = get_timer_irqs(cpu); - if (__get_cpu_var(last_irq_sum) == sum) { - return 1; - } else { - __get_cpu_var(last_irq_sum) = sum; - return 0; - } -} - u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; -- cgit v1.2.3 From 5e85391b3badd3f0e50ebdd0cafe0202a979f73a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2010 09:12:39 +0200 Subject: x86, watchdog: Fix build error in hw_nmi.c On some configs the following build error triggers: arch/x86/kernel/apic/hw_nmi.c:35: error: 'apic' undeclared (first use in this function) arch/x86/kernel/apic/hw_nmi.c:35: error: (Each undeclared identifier is reported only once arch/x86/kernel/apic/hw_nmi.c:35: error: for each function it appears in.) Because asm/apic.h was only included implicitly. Include it explicitly. 
Cc: Frederic Weisbecker Cc: Don Zickus Cc: Peter Zijlstra Cc: Cyrill Gorcunov LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 3b40082f037..cefd6942f0e 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -8,6 +8,7 @@ * Bits copied from original nmi.c file * */ +#include #include #include -- cgit v1.2.3 From c01d4323309a90a298fd81cf3a059ee1b12be2e9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 May 2010 22:57:48 +0200 Subject: lockup_detector: Adapt CONFIG_PERF_EVENT_NMI to other archs CONFIG_PERF_EVENT_NMI is something that need to be enabled from the arch. This is fine on x86 as PERF_EVENTS is builtin but if other archs select it, they will need to handle the PERF_EVENTS dependency. Instead, handle the dependency in the generic layer: - archs need to tell what they support through HAVE_PERF_EVENTS_NMI - Enable magically PERF_EVENTS_NMI if we have PERF_EVENTS and HAVE_PERF_EVENTS_NMI. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3cb28cd1f55..3cb5bb02172 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,7 +54,7 @@ config X86 select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS - select PERF_EVENTS_NMI + select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER -- cgit v1.2.3 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. 
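[ Editor's sketch: concretely, the generic helper definition wants a CONFIG_HARDLOCKUP_DETECTOR guard so that arch-private versions (sparc, in the build failure below) keep their own copy. The per-cpu flag name here is illustrative: ]

/* kernel/watchdog.c (sketch): compile the generic helper only when the
 * perf-based hardlockup detector is actually built in, so architectures
 * that define their own touch_nmi_watchdog() do not clash with it. */
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
	__get_cpu_var(watchdog_nmi_touch) = true;	/* illustrative flag */
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif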
Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea19..910f20b457c 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o -- cgit v1.2.3 From 1dedefd1a066a795a87afca9c0236e1a94de9bf6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:23 -0700 Subject: x86: detect scattered cpuid features earlier Some extra CPU features such as ARAT is needed in early boot so that x86_init function pointers can be set up properly. http://lkml.org/lkml/2010/5/18/519 At start_kernel() level, this patch moves init_scattered_cpuid_features() from check_bugs() to setup_arch() -> early_cpu_init() which is earlier than platform specific x86_init layer setup. Suggested by HPA. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-2-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1c00d0b169..284bf89ddae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } -- cgit v1.2.3 From a0c173bd8a3fd0541be8e4ef962170e48d8811c7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:24 -0700 Subject: x86, mrst: add cpu type detection Medfield is the follow-up of Moorestown, it is treated under the same HW sub-architecture. However, we do need to know the CPU type in order for some of the driver to act accordingly. We also have different optimal clock configuration for each CPU type. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mrst.h | 19 +++++++++++++++++++ arch/x86/kernel/mrst.c | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 451d30e7f62..dc5c8500bfc 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,8 +11,27 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +extern int mrst_identify_cpu(void); int __init sfi_parse_mrtc(struct sfi_table_header *table); +/* + * Medfield is the follow-up of Moorestown, it combines two chip solution into + * one. Other than that it also added always-on and constant tsc and lapic + * timers. Medfield is the platform name, and the chip name is called Penwell + * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be + * identified via MSRs. + */ +enum mrst_cpu_type { + MRST_CPU_CHIP_LINCROFT = 1, + MRST_CPU_CHIP_PENWELL, +}; + +enum mrst_timer_options { + MRST_TIMER_DEFAULT, + MRST_TIMER_APBT_ONLY, + MRST_TIMER_LAPIC_APBT, +}; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index e796448f0eb..ceaebeb5866 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -27,6 +27,8 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +static int mrst_cpu_chip; + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -216,6 +218,28 @@ static void __init mrst_setup_boot_clock(void) setup_boot_APIC_clock(); }; +int mrst_identify_cpu(void) +{ + return mrst_cpu_chip; +} +EXPORT_SYMBOL_GPL(mrst_identify_cpu); + +void __cpuinit mrst_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -230,6 +254,8 @@ void __init x86_mrst_early_setup(void) x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.oem.arch_setup = mrst_arch_setup; + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; -- cgit v1.2.3 From a875c01944f0d750eeb1ef3133feceb13f13c4b3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:25 -0700 Subject: x86, mrst: add more timer config options Always-on local APIC timer (ARAT) has been introduced to Medfield, along with the platform APB timers we have more timer configuration options between Moorestown and Medfield. This patch adds run-time detection of available timer features so that we can treat Medfield as a variant of Moorestown and set up the optimal timer options for each platform. i.e. Medfield: per cpu always-on local APIC timer Moorestown: per cpu APB timer Manual override is possible via cmdline option x86_mrst_timer. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/mrst.h | 1 + arch/x86/kernel/apb_timer.c | 37 ++++------------- arch/x86/kernel/mrst.c | 88 ++++++++++++++++++++++++++++------------ 4 files changed, 72 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index c74a2eebe57..a69b1ac9eaf 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); extern unsigned int boot_cpu_id; -extern int disable_apbt_percpu; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index dc5c8500bfc..67ad3154577 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -12,6 +12,7 @@ #define _ASM_X86_MRST_H extern int pci_mrst_init(void); extern int mrst_identify_cpu(void); +extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d3..8dd77800ff5 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include #include +#include #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -194,29 +193,6 @@ static struct clock_event_device apbt_clockevent = { .rating = APBT_CLOCKEVENT_RATING, }; -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - /* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. 
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index ceaebeb5866..636b53bd419 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,6 +25,29 @@ #include #include +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +int mrst_timer_options __cpuinitdata; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; static int mrst_cpu_chip; @@ -169,18 +192,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. 
- */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -197,6 +208,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -207,17 +233,6 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) -{ - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; - int mrst_identify_cpu(void) { return mrst_cpu_chip; @@ -250,13 +265,13 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.oem.arch_setup = mrst_arch_setup; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; @@ -269,3 +284,26 @@ void __init x86_mrst_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; } + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); -- cgit v1.2.3 From a75af580bb1fd261bf63cc00e4b324e17ceb15cf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 13:40:14 -0700 Subject: x86, mrst: make mrst_identify_cpu() an inline returning enum We have an enum, might as well use it. While we're at it, make it an inline... there is really no point in calling a function for this stuff. LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 7 ++++++- arch/x86/kernel/mrst.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 67ad3154577..1869c18d15c 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_identify_cpu(void); extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -27,6 +26,12 @@ enum mrst_cpu_type { MRST_CPU_CHIP_PENWELL, }; +extern enum mrst_cpu_type __mrst_cpu_chip; +static enum mrst_cpu_type mrst_identify_cpu(void) +{ + return __mrst_cpu_chip; +} + enum mrst_timer_options { MRST_TIMER_DEFAULT, MRST_TIMER_APBT_ONLY, diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 636b53bd419..967f2686adb 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -50,7 +50,8 @@ int mrst_timer_options __cpuinitdata; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -static int mrst_cpu_chip; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); int sfi_mtimer_num; @@ -233,25 +234,19 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -int mrst_identify_cpu(void) -{ - return mrst_cpu_chip; -} -EXPORT_SYMBOL_GPL(mrst_identify_cpu); - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; } pr_debug("Moorestown CPU %s identified\n", - (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? "Lincroft" : "Penwell"); } -- cgit v1.2.3 From 14671386dcbafb3086bbda3cb6f9f27d34c7bf6d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 14:37:40 -0700 Subject: x86, mrst: make mrst_timer_options an enum We have an enum mrst_timer_options, use it so that the kernel knows if we're missing something from a switch statement or equivalent. Signed-off-by: H. 
Peter Anvin LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Cc: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 3 ++- arch/x86/kernel/mrst.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 1869c18d15c..16350740edf 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* @@ -38,6 +37,8 @@ enum mrst_timer_options { MRST_TIMER_LAPIC_APBT, }; +extern enum mrst_timer_options mrst_timer_options; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 967f2686adb..7ee4ed901ba 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -46,7 +46,7 @@ * lapic (always-on,ARAT) ------ 150 */ -int mrst_timer_options __cpuinitdata; +__cpuinitdata enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -- cgit v1.2.3 From c0011dbfce69467b23b08fb4a64c39a409a935fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 4 Feb 2010 14:46:34 -0800 Subject: xen: use _PAGE_IOMAP in ioremap to do machine mappings In a Xen domain, ioremap operates on machine addresses, not pseudo-physical addresses. We use _PAGE_IOMAP to determine whether a mapping is intended for machine addresses. [ Impact: allow Xen domain to map real hardware ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 8 ++--- arch/x86/xen/mmu.c | 71 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 018a0a40079..bf5f7d32bd0 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) */ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { - extern unsigned long max_mapnr; unsigned long pfn = mfn_to_pfn(mfn); - if ((pfn < max_mapnr) - && !xen_feature(XENFEAT_auto_translated_physmap) - && (get_phys_to_machine(pfn) != mfn)) - return max_mapnr; /* force !pfn_valid() */ - /* XXX fixme; not true with sparsemem */ + if (get_phys_to_machine(pfn) != mfn) + return -1; /* force !pfn_valid() */ return pfn; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..a4dea9df0cc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,9 +56,11 @@ #include #include +#include #include #include #include +#include #include #include "multicalls.h" @@ -377,6 +379,28 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } +static bool xen_iomap_pte(pte_t pte) +{ + return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); +} + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + static void 
xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -453,6 +477,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + if (xen_iomap_pte(pteval)) { + xen_set_iomap_pte(ptep, pteval); + goto out; + } + ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -523,8 +552,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) return val; } +static pteval_t iomap_pte(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + + /* We assume the pte frame number is a MFN, so + just use it as-is. */ + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + pteval_t xen_pte_val(pte_t pte) { + if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) + return pte.pte; + return pte_mfn_to_pfn(pte.pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -537,7 +583,11 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - pte = pte_pfn_to_mfn(pte); + if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + pte = iomap_pte(pte); + else + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); @@ -593,6 +643,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + ADD_STATS(pte_update, 1); // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); @@ -609,6 +664,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) #ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + set_64bit((u64 *)ptep, native_pte_val(pte)); } @@ -1811,9 +1871,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; - default: + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ pte = mfn_pte(phys, prot); break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + break; } __native_set_fixmap(idx, pte); -- cgit v1.2.3 From 7347b4082e55ac4a673f06a0a0ce25c37273c9ec Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 19 Feb 2010 13:31:06 -0500 Subject: xen: Allow unprivileged Xen domains to create iomap pages PV DomU domains are allowed to map hardware MFNs for PCI passthrough, but are not generally allowed to map raw machine pages. In particular, various pieces of code try to map DMI and ACPI tables in the ISA ROM range. We disallow _PAGE_IOMAP for those mappings, so that they are redirected to a set of local zeroed pages we reserve for that purpose. 
[ Impact: prevent passthrough of ISA space, as we only allow PCI ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ++++ arch/x86/xen/mmu.c | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..3254e8bc4cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1145,6 +1145,10 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a4dea9df0cc..a5577f59416 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -381,7 +382,7 @@ static bool xen_page_pinned(void *ptr) static bool xen_iomap_pte(pte_t pte) { - return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); + return pte_flags(pte) & _PAGE_IOMAP; } static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) @@ -583,10 +584,21 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + phys_addr_t addr = (pte & PTE_PFN_MASK); + + /* + * Unprivileged domains are allowed to do IOMAPpings for + * PCI passthrough, but not map ISA space. The ISA + * mappings are just dummy local mappings to keep other + * parts of the kernel happy. + */ + if (unlikely(pte & _PAGE_IOMAP) && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { pte = iomap_pte(pte); - else + } else { + pte &= ~_PAGE_IOMAP; pte = pte_pfn_to_mfn(pte); + } return native_make_pte(pte); } -- cgit v1.2.3 From 19001c8c5bfa032ed45b10dfe48e355f5df88c61 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Rename the balloon lock * xen_create_contiguous_region needs access to the balloon lock to ensure memory doesn't change under its feet, so expose the balloon lock * Change the name of the lock to xen_reservation_lock, to imply it's now less-specific usage. [ Impact: cleanup ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a5577f59416..9e0d82fc21e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -70,6 +70,13 @@ #define MMU_UPDATE_HISTO 30 +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + #ifdef CONFIG_XEN_DEBUG_FS static struct { -- cgit v1.2.3 From 08bbc9da92f7e44b9c208c6a1adba70c403b255e Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Add xen_create_contiguous_region A memory region must be physically contiguous in order to be accessed through DMA. This patch adds xen_create_contiguous_region, which ensures a region of contiguous virtual memory is also physically contiguous. Based on Stephen Tweedie's port of the 2.6.18-xen version. 
Remove contiguous_bitmap[] as it's no longer needed. Ported from linux-2.6.18-xen.hg 707:e410857fd83c [ Impact: add Xen-internal API to make pages phys-contig ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9e0d82fc21e..eb51402dd99 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -2027,6 +2028,206 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL << order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; +} + +int xen_create_contiguous_region(unsigned long vstart, unsigned int order, + unsigned int address_bits) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) { + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; -- cgit v1.2.3 From e768aee89c687a50e6a2110e30c5cae1fbf0d2da Mon Sep 17 00:00:00 2001 From: Livio Soares Date: Thu, 3 Jun 2010 15:00:31 -0400 Subject: perf, x86: Small fix to cpuid10_edx Fixes to 'cpuid10_edx' to comply with Intel documentation. According to the Intel Manual, Volume 2A, Table 3-12, the cpuid for architecture performance monitoring returns, in EDX, two pieces of information: 1) Number of fixed-function counters (5 bits, not 4) 2) Width of fixed-function counters (8 bits) Signed-off-by: Livio Soares Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arjan van de Ven Cc: "H. 
Peter Anvin" LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e..6ed3ae4f548 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -68,8 +68,9 @@ union cpuid10_eax { union cpuid10_edx { struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; + unsigned int num_counters_fixed:5; + unsigned int bit_width_fixed:8; + unsigned int reserved:19; } split; unsigned int full; }; -- cgit v1.2.3 From 12a6611fa16e9c6d2f844fe2175d219c6e9bd95d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: Calculate BAU destination timeout Calculate the Broadcast Assist Unit's destination timeout period from the values in the relevant MMR's. Store it in each cpu's per-cpu BAU structure so that a destination timeout can be differentiated from a 'plugged' situation in which all software ack resources are already allocated and a timeout is pending. That case returns an immediate destination error. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 12 ++++++++++ arch/x86/kernel/tlb_uv.c | 51 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index aa558ac0306..458e04c626a 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -49,6 +49,18 @@ #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL +/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ +#define BAU_MISC_CONTROL_MULT_MASK 3 + +#define UVH_AGING_PRESCALE_SEL 0x000000b000UL +/* [30:28] URGENCY_7 an index into a table of times */ +#define BAU_URGENCY_7_SHIFT 28 +#define BAU_URGENCY_7_MASK 7 + +#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL +/* [45:40] BAU - BAU transaction timeout select - a multiplier */ +#define BAU_TRANS_SHIFT 40 +#define BAU_TRANS_MASK 0x3f /* * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555929e..5506836c4a8 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -30,6 +30,19 @@ struct msg_desc { struct bau_payload_queue_entry *va_queue_last; }; +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; +static int timeout_us; + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL static int uv_bau_max_concurrent __read_mostly; @@ -423,7 +436,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * pending. In that case hardware returns the * ERROR that looks like a destination timeout. 
*/ - if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + if (cycles_2_us(ttime - bcp->send_message) < + timeout_us) { bcp->conseccompletes = 0; return FLUSH_RETRY_PLUGGED; } @@ -908,12 +922,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) } static inline unsigned long long -millisec_2_cycles(unsigned long millisec) +microsec_2_cycles(unsigned long microsec) { unsigned long ns; unsigned long long cyc; - ns = millisec * 1000; + ns = microsec * 1000; cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); return cyc; } @@ -1258,6 +1272,33 @@ static void __init uv_init_uvhub(int uvhub, int vector) ((apicid << 32) | vector)); } +/* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has to be calculated from them. + */ +static int +calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + return ret; +} + /* * initialize the bau_control structure for each cpu */ @@ -1286,6 +1327,8 @@ static void uv_init_per_cpu(int nuvhubs) }; struct uvhub_desc *uvhub_descs; + timeout_us = calculate_destination_timeout(); + uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); @@ -1301,7 +1344,7 @@ static void uv_init_per_cpu(int nuvhubs) bdp->uvhub = uvhub; bdp->pnode = pnode; /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = millisec_2_cycles(3); + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) -- cgit v1.2.3 From e8e5e8a8048006a12d7777a93baebd6e39496101 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: BAU tunables into a debugfs file Make the Broadcast Assist Unit driver's nine tuning values variable by making them accessible through a read/write debugfs file. The file will normally appear as /sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each cpu's per-cpu BAU structure. The patch also does a little name improvement, and corrects the reset of two destination timeout counters. 
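As a usage illustration (a hedged sketch, not part of the patch): the file takes exactly nine whitespace-separated integers on a write, in the order named by the file's own header line, and a value of 0 restores that tunable's built-in default. Assuming debugfs is mounted at /sys/kernel/debug, a user-space fragment like the following would set max_bau_concurrent to 2 and leave the other eight tunables at their defaults:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sgi_uv/bau_tunables";
	/* max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
	   ipi_reset_limit complete_threshold congested_response_us
	   congested_reps congested_period; 0 means "use the default" */
	const char *vals = "2 0 0 0 0 0 0 0 0\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, vals, strlen(vals)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the file back shows the tunable names on the first line and the nine current values on the second.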
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 53 +++++--- arch/x86/kernel/tlb_uv.c | 281 +++++++++++++++++++++++++++++++++------ 2 files changed, 278 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 458e04c626a..e5543c1a80c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -45,10 +45,14 @@ #define UV_DESC_BASE_PNODE_SHIFT 49 #define UV_PAYLOADQ_PNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define UV_BAU_BASENAME "sgi_uv/bau_tunables" +#define UV_BAU_TUNABLES_DIR "sgi_uv" +#define UV_BAU_TUNABLES_FILE "bau_tunables" +#define WHITESPACE " \t\n" #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL /* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -70,25 +74,23 @@ #define DESC_STATUS_DESTINATION_TIMEOUT 2 #define DESC_STATUS_SOURCE_TIMEOUT 3 +#define TIMEOUT_DELAY 10 /* - * source side threshholds at which message retries print a warning - */ -#define SOURCE_TIMEOUT_LIMIT 20 -#define DESTINATION_TIMEOUT_LIMIT 20 - -/* - * misc. delays, in microseconds + * delay for 'plugged' timeout retries, in microseconds */ -#define THROTTLE_DELAY 10 -#define TIMEOUT_DELAY 10 -#define BIOS_TO 1000 -/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */ +#define PLUGGED_DELAY 10 /* * threshholds at which to use IPI to free resources */ +/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ #define PLUGSB4RESET 100 -#define TIMEOUTSB4RESET 100 +/* after this many consecutive timeouts, use IPI to release resources */ +#define TIMEOUTSB4RESET 1 +/* at this number uses of IPI to release resources, giveup the request */ +#define IPI_RESET_LIMIT 1 +/* after this # consecutive successes, bump up the throttle if it was lowered */ +#define COMPLETE_THRESHOLD 5 /* * number of entries in the destination side payload queue @@ -107,6 +109,13 @@ #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +/* + * tuning the action when the numalink network is extremely delayed + */ +#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ +#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ +#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ + /* * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) * If the 'multilevel' flag in the header portion of the descriptor @@ -323,14 +332,13 @@ struct bau_control { struct bau_control *uvhub_master; struct bau_control *socket_master; unsigned long timeout_interval; + unsigned long set_bau_on_time; atomic_t active_descriptor_count; - int max_concurrent; - int max_concurrent_constant; - int retry_message_scans; int plugged_tries; int timeout_tries; int ipi_attempts; int conseccompletes; + int set_bau_off; short cpu; short uvhub_cpu; short uvhub; @@ -343,6 +351,19 @@ struct bau_control { spinlock_t masks_lock; spinlock_t uvhub_lock; spinlock_t queue_lock; + /* tunables */ + int max_bau_concurrent; + int max_bau_concurrent_constant; + int plugged_delay; + int plugsb4reset; + int timeoutsb4reset; + int ipi_reset_limit; + int complete_threshold; + int 
congested_response_us; + int congested_reps; + int congested_period; + cycles_t period_time; + long period_requests; }; /* diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 5506836c4a8..c8661779c51 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include @@ -42,12 +43,22 @@ static int timeout_base_ns[] = { 167772160 }; static int timeout_us; +static int nobau; -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL - -static int uv_bau_max_concurrent __read_mostly; +/* tunables: */ +static int max_bau_concurrent = MAX_BAU_CONCURRENT; +static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_response_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; +static struct dentry *tunables_dir; +static struct dentry *tunables_file; -static int nobau; static int __init setup_nobau(char *arg) { nobau = 1; @@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, unsigned long index; cycles_t time1; cycles_t time2; + cycles_t elapsed; struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; /* - * Spin here while there are hmaster->max_concurrent or more active + * Spin here while there are hmaster->max_bau_concurrent or more active * descriptors. This is the per-uvhub 'throttle'. */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)) { + hmaster->max_bau_concurrent)) { stat->s_throttles++; do { cpu_relax(); } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)); + hmaster->max_bau_concurrent)); } while (hmaster->uvhub_quiesce) @@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, * that case hardware immediately returns the ERROR * that looks like a destination timeout. 
*/ - udelay(TIMEOUT_DELAY); + udelay(bcp->plugged_delay); bcp->plugged_tries++; - if (bcp->plugged_tries >= PLUGSB4RESET) { + if (bcp->plugged_tries >= bcp->plugsb4reset) { bcp->plugged_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_plug++; } } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_concurrent = 1; + hmaster->max_bau_concurrent = 1; bcp->timeout_tries++; udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { bcp->timeout_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_timeout++; } } - if (bcp->ipi_attempts >= 3) { + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; completion_status = FLUSH_GIVEUP; break; @@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) - && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) - hmaster->max_concurrent++; + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + + if ((completion_status == FLUSH_COMPLETE) && + (bcp->conseccompletes > bcp->complete_threshold) && + (hmaster->max_bau_concurrent < + hmaster->max_bau_concurrent_constant)) + hmaster->max_bau_concurrent++; /* * hold any cpu not timing out here; no other cpu currently held by @@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, atomic_dec(&hmaster->active_descriptor_count); /* guard against cycles wrap */ - if (time2 > time1) - stat->s_time += (time2 - time1); - else + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; @@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; + /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + /* * Each sending cpu has a per-cpu mask which it fills from the caller's * cpu mask. Only remote cpus are converted to uvhubs and copied. 
@@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", @@ -985,10 +1006,29 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) return 0; } +/* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[300]; + int ret; + + ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_bau_concurrent plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_bau_concurrent, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_response_us, congested_reps, congested_period); + + return simple_read_from_buffer(userbuf, count, ppos, buf, ret); +} + /* * -1: resetf the statistics * 0: display meaning of the statistics - * >0: maximum concurrent active descriptors per uvhub (throttle) */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) @@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long input_arg; char optstr[64]; struct ptc_stats *stat; - struct bau_control *bcp; if (count == 0 || count > sizeof(optstr)) return -EINVAL; @@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, stat = &per_cpu(ptcstats, cpu); memset(stat, 0, sizeof(struct ptc_stats)); } - } else { - uv_bau_max_concurrent = input_arg; - bcp = &per_cpu(bau_control, smp_processor_id()); - if (uv_bau_max_concurrent < 1 || - uv_bau_max_concurrent > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d; %d is invalid\n", - bcp->max_concurrent, uv_bau_max_concurrent); - return -EINVAL; - } - printk(KERN_DEBUG "Set BAU max concurrent:%d\n", - uv_bau_max_concurrent); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_concurrent = uv_bau_max_concurrent; + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... 
'9': + val = 10*val+(*name-'0'); + break; + default: + return val; } } +} + +/* + * set the tunables + * 0 values reset them to defaults + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int cnt = 0; + int val; + char *p; + char *q; + char instr[64]; + struct bau_control *bcp; + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + /* count the fields */ + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != 9) { + printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_bau_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent_constant = + MAX_BAU_CONCURRENT; + continue; + } + bcp = &per_cpu(bau_control, smp_processor_id()); + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_bau_concurrent = val; + max_bau_concurrent_constant = val; + continue; + case 1: + if (val == 0) + plugged_delay = PLUGGED_DELAY; + else + plugged_delay = val; + continue; + case 2: + if (val == 0) + plugsb4reset = PLUGSB4RESET; + else + plugsb4reset = val; + continue; + case 3: + if (val == 0) + timeoutsb4reset = TIMEOUTSB4RESET; + else + timeoutsb4reset = val; + continue; + case 4: + if (val == 0) + ipi_reset_limit = IPI_RESET_LIMIT; + else + ipi_reset_limit = val; + continue; + case 5: + if (val == 0) + complete_threshold = COMPLETE_THRESHOLD; + else + complete_threshold = val; + continue; + case 6: + if (val == 0) + congested_response_us = CONGESTED_RESPONSE_US; + else + congested_response_us = val; + continue; + case 7: + if (val == 0) + congested_reps = CONGESTED_REPS; + else + congested_reps = val; + continue; + case 8: + if (val == 0) + congested_period = CONGESTED_PERIOD; + else + congested_period = val; + continue; + } + if (q == p) + break; + } + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } return count; } @@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) return seq_open(file, &uv_ptc_seq_ops); } +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + static const struct file_operations proc_uv_ptc_operations = { .open = uv_ptc_proc_open, .read = seq_read, @@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = { .release = seq_release, }; +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, +}; + static int __init uv_ptc_init(void) { struct proc_dir_entry *proc_uv_ptc; @@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void) 
UV_PTC_BASENAME); return -EINVAL; } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } return 0; } @@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); spin_lock_init(&bcp->masks_lock); - bcp->max_concurrent = uv_bau_max_concurrent; pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) @@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs) } } kfree(uvhub_descs); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } } /* @@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - uv_bau_max_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); @@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void) return 0; } core_initcall(uv_bau_init); -core_initcall(uv_ptc_init); +fs_initcall(uv_ptc_init); -- cgit v1.2.3 From 50fb55acc5bbe5ee29d0a65262f4ec286b14d156 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Disable BAU on network congestion The numalink network can become so congested that TLB shootdown using the Broadcast Assist Unit becomes slower than using IPI's. In that case, disable the use of the BAU for a period of time. The period is tunable. When the period expires the use of the BAU is re-enabled. A count of these actions is added to the statistics file. 
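The decision itself is simple arithmetic; a simplified restatement (stripped of the per-cpu bookkeeping and locking that the patch below adds) of the trigger condition is:

#include <stdbool.h>

/* All values are in cycles; congested_cycles is the cycle equivalent
 * of the congested_response_us tunable.  Disable the BAU once the
 * mean broadcast latency over the sampling period exceeds it. */
static bool should_disable_bau(unsigned long period_time,
			       long period_requests,
			       unsigned long congested_cycles)
{
	if (period_requests == 0)
		return false;
	return (period_time / period_requests) > congested_cycles;
}

In the patch, this check only runs after more than congested_reps broadcasts have been timed, and the cpu that trips it schedules the re-enable for congested_period seconds later.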
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 4 +++ arch/x86/kernel/tlb_uv.c | 76 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index e5543c1a80c..9b3e750ef2d 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -34,6 +34,7 @@ */ #define UV_ITEMS_PER_DESCRIPTOR 8 +/* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 #define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 @@ -338,6 +339,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; + int baudisabled; int set_bau_off; short cpu; short uvhub_cpu; @@ -389,6 +391,8 @@ struct ptc_stats { unsigned long s_busy; /* status stayed busy past s/w timer */ unsigned long s_throttles; /* waits in throttle */ unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index c8661779c51..dc6a6831275 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -44,6 +44,9 @@ static int timeout_base_ns[] = { }; static int timeout_us; static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; /* tunables: */ static int max_bau_concurrent = MAX_BAU_CONCURRENT; @@ -519,6 +522,35 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Completions are taking a very long time due to a congested numalink + * network. 
+ */ +static void +disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles() + + sec_2_cycles(bcp->congested_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + spin_unlock(&disable_lock); +} + /** * uv_flush_send_and_wait * @@ -681,6 +713,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->congested_reps)) { + disable_for_congestion(bcp, stat); + } + } } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) @@ -747,12 +787,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + struct bau_control *tbcp; /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + stat = &per_cpu(ptcstats, cpu); + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + /* the cpu that disabled it must re-enable it */ + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + } + } + return cpumask; + } /* * Each sending cpu has a per-cpu mask which it fills from the caller's @@ -793,7 +853,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, else return NULL; } - stat = &per_cpu(ptcstats, cpu); stat->s_requestor++; stat->s_ntargcpu += remotes; remotes = bau_uvhub_weight(&bau_desc->distribution); @@ -973,7 +1032,9 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "sw_ack recv rtime all "); seq_printf(file, - "one mult none retry canc nocan reset rcan\n"); + "one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -993,7 +1054,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", uv_read_global_mmr64(uv_cpu_to_pnode(cpu), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), stat->d_requestee, cycles_2_us(stat->d_time), @@ -1001,6 +1062,8 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); + seq_printf(file, "%ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled); } return 0; @@ -1112,6 +1175,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, "reset: number of ipi-style reset requests processed\n"); printk(KERN_DEBUG "rcan: number messages canceled by reset 
requests\n"); + printk(KERN_DEBUG + "disable: number times use of the BAU was disabled\n"); + printk(KERN_DEBUG + "enable: number times use of the BAU was re-enabled\n"); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1568,6 +1635,7 @@ static void uv_init_per_cpu(int nuvhubs) kfree(uvhub_descs); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1609,6 +1677,8 @@ static int __init uv_bau_init(void) uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = microsec_2_cycles(congested_response_us); uv_init_per_cpu(nuvhubs); -- cgit v1.2.3 From 712157aa703a01f58c7c17452096ab00b774d0a9 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Shorten access to BAU statistics structure Use a pointer from the per-cpu BAU control structure to the per-cpu BAU statistics structure. We nearly always know the first before needing the second. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/kernel/tlb_uv.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 9b3e750ef2d..6a42d42eb8f 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -332,6 +332,7 @@ struct bau_control { struct bau_payload_queue_entry *bau_msg_head; struct bau_control *uvhub_master; struct bau_control *socket_master; + struct ptc_stats *statp; unsigned long timeout_interval; unsigned long set_bau_on_time; atomic_t active_descriptor_count; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc6a6831275..261b9653cde 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -153,7 +153,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, struct ptc_stats *stat; msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_retries++; /* * cancel any message from msg+1 to the retry itself @@ -217,7 +217,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, * This must be a normal message, or retry of a normal message */ msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); stat->d_alltlb++; @@ -301,7 +301,7 @@ uv_do_reset(void *ptr) bcp = &per_cpu(bau_control, smp_processor_id()); rap = (struct reset_args *)ptr; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_resets++; /* @@ -419,7 +419,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mask; cycles_t ttime; cycles_t timeout_time; - struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; @@ -583,7 +583,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, cycles_t time1; cycles_t time2; cycles_t elapsed; - struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; @@ -794,7 +794,7 @@ const struct 
cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, return cpumask; bcp = &per_cpu(bau_control, cpu); - stat = &per_cpu(ptcstats, cpu); + stat = bcp->statp; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -903,7 +903,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); - stat = &per_cpu(ptcstats, smp_processor_id()); + stat = bcp->statp; msgdesc.va_queue_first = bcp->va_queue_first; msgdesc.va_queue_last = bcp->va_queue_last; msg = bcp->bau_msg_head; @@ -1636,6 +1636,7 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1673,7 +1674,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); -- cgit v1.2.3 From 4faca1550838708d71f6eea14cdacb0876c3a5a4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU structure rearranging Move some structure definitions from the C code to the BAU header file, and change the organization of that header file a little. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 90 +++++++++++++++++++++++----------------- arch/x86/kernel/tlb_uv.c | 12 ------ 2 files changed, 51 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6a42d42eb8f..1c8f1e9bf74 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -322,6 +322,57 @@ struct bau_payload_queue_entry { /* bytes 24-31 */ }; +struct msg_desc { + struct bau_payload_queue_entry *msg; + int msg_slot; + int sw_ack_slot; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; +}; + +struct reset_args { + int sender; +}; + +/* + * This structure is allocated per_cpu for UV TLB shootdown statistics. 
+ */ +struct ptc_stats { + /* sender statistics */ + unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ + unsigned long s_requestor; /* number of shootdown requests */ + unsigned long s_stimeout; /* source side timeouts */ + unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_time; /* time spent in sending side */ + unsigned long s_retriesok; /* successful retries */ + unsigned long s_ntargcpu; /* total number of cpu's targeted */ + unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ + unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ + unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ + unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ + unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ + unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */ + unsigned long s_resets_plug; /* ipi-style resets from plug state */ + unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ + unsigned long s_busy; /* status stayed busy past s/w timer */ + unsigned long s_throttles; /* waits in throttle */ + unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ + /* destination statistics */ + unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ + unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ + unsigned long d_multmsg; /* interrupts with multiple messages */ + unsigned long d_nomsg; /* interrupts with no message */ + unsigned long d_time; /* time spent on destination side */ + unsigned long d_requestee; /* number of messages processed */ + unsigned long d_retries; /* number of retry messages processed */ + unsigned long d_canceled; /* number of messages canceled by retries */ + unsigned long d_nocanceled; /* retries that found nothing to cancel */ + unsigned long d_resets; /* number of ipi-style requests processed */ + unsigned long d_rcanceled; /* number of messages canceled by resets */ +}; + /* * one per-cpu; to locate the software tables */ @@ -369,45 +420,6 @@ struct bau_control { long period_requests; }; -/* - * This structure is allocated per_cpu for UV TLB shootdown statistics. 
- */ -struct ptc_stats { - /* sender statistics */ - unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ - unsigned long s_requestor; /* number of shootdown requests */ - unsigned long s_stimeout; /* source side timeouts */ - unsigned long s_dtimeout; /* destination side timeouts */ - unsigned long s_time; /* time spent in sending side */ - unsigned long s_retriesok; /* successful retries */ - unsigned long s_ntargcpu; /* number of cpus targeted */ - unsigned long s_ntarguvhub; /* number of uvhubs targeted */ - unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */ - unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */ - unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */ - unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */ - unsigned long s_ntarguvhub1; /* number of times == 1 target hub */ - unsigned long s_resets_plug; /* ipi-style resets from plug state */ - unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ - unsigned long s_busy; /* status stayed busy past s/w timer */ - unsigned long s_throttles; /* waits in throttle */ - unsigned long s_retry_messages; /* retry broadcasts */ - unsigned long s_bau_reenabled; /* for bau enable/disable */ - unsigned long s_bau_disabled; /* for bau enable/disable */ - /* destination statistics */ - unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ - unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ - unsigned long d_multmsg; /* interrupts with multiple messages */ - unsigned long d_nomsg; /* interrupts with no message */ - unsigned long d_time; /* time spent on destination side */ - unsigned long d_requestee; /* number of messages processed */ - unsigned long d_retries; /* number of retry messages processed */ - unsigned long d_canceled; /* number of messages canceled by retries */ - unsigned long d_nocanceled; /* retries that found nothing to cancel */ - unsigned long d_resets; /* number of ipi-style requests processed */ - unsigned long d_rcanceled; /* number of messages canceled by resets */ -}; - static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 261b9653cde..d7592903984 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,14 +23,6 @@ #include #include -struct msg_desc { - struct bau_payload_queue_entry *msg; - int msg_slot; - int sw_ack_slot; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; -}; - /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -79,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); -struct reset_args { - int sender; -}; - /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. -- cgit v1.2.3 From 39847e7f3c8198b14102fe7ba4b3a6a1d84bbcfe Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU software acknowledge Correct the acknowledgment and the reset of a BAU software-acknowledged message. A retry message should be testing only for timed-out resources (mask << 8). 
(And we delete a log message that might cause unnecessary concern) The acknowledge MMR is |--timed-out--|---pending--|, each is 8 bits. The IPI-driven reset of software acknowledge resources frees both timed out and pending resources. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d7592903984..295a41122da 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -161,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, slot2 = msg2 - mdp->va_queue_first; mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg2->sw_ack_vector << 8) | - msg2->sw_ack_vector); + msg_res = msg2->sw_ack_vector; /* * This is a message retry; clear the resources held * by the previous message only if they timed out. * If it has not timed out we have an unexpected * situation to report. */ - if (mmr & (msg_res << 8)) { + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { /* * is the resource timed out? * make everyone ignore the cancelled message. @@ -179,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, cancel_count++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << 8) | msg_res); - } else - printk(KERN_INFO "note bau retry: no effect\n"); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } } } if (!cancel_count) @@ -317,13 +316,13 @@ uv_do_reset(void *ptr) */ mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg->sw_ack_vector << 8) | - msg->sw_ack_vector); + msg_res = msg->sw_ack_vector; if (mmr & msg_res) { stat->d_rcanceled++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - msg_res); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); } } } -- cgit v1.2.3 From a8328ee58c15c9d763a67607a35bb987b38950fa Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU discovery of hubs and sockets Correct the initialization-time assumption of contiguous blade numbers and of sockets numbered from zero. There may be hubs present with no cpu's enabled. There may be disabled sockets such that the active socket is not number zero. And assign a 'socket master' by assuming that a socket is a node. 
(it is not safe to extract socket number from an apicid) Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 49 ++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 295a41122da..ab929e97650 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1547,11 +1547,13 @@ calculate_destination_timeout(void) */ static void uv_init_per_cpu(int nuvhubs) { - int i, j, k; + int i; int cpu; int pnode; int uvhub; short socket = 0; + unsigned short socket_mask; + unsigned int uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1562,7 +1564,7 @@ static void uv_init_per_cpu(int nuvhubs) short cpu_number[16]; }; struct uvhub_desc { - short num_sockets; + unsigned short socket_mask; short num_cpus; short uvhub; short pnode; @@ -1581,43 +1583,54 @@ static void uv_init_per_cpu(int nuvhubs) spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + uvhub_mask |= (1 << uvhub); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* kludge: assume uv_hub.h is constant */ - socket = (cpu_physical_id(cpu)>>5)&1; - if (socket >= bdp->num_sockets) - bdp->num_sockets = socket+1; + /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ + socket = (cpu_to_node(cpu) & 1); + bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - socket = 0; - for_each_possible_blade(uvhub) { + uvhub = 0; + while (uvhub_mask) { + if (!(uvhub_mask & 1)) + goto nexthub; bdp = &uvhub_descs[uvhub]; - for (i = 0; i < bdp->num_sockets; i++) { - sdp = &bdp->socket[i]; - for (j = 0; j < sdp->num_cpus; j++) { - cpu = sdp->cpu_number[j]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + if (!(socket_mask & 1)) + goto nextsocket; + sdp = &bdp->socket[socket]; + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; bcp = &per_cpu(bau_control, cpu); bcp->cpu = cpu; - if (j == 0) { + if (i == 0) { smaster = bcp; - if (i == 0) + if (socket == 0) hmaster = bcp; } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = smaster; + bcp->uvhub = bdp->uvhub; bcp->uvhub_master = hmaster; - for (k = 0; k < DEST_Q_SIZE; k++) - bcp->socket_acknowledge_count[k] = 0; - bcp->uvhub_cpu = - uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> + blade_processor_id; } +nextsocket: socket++; + socket_mask = (socket_mask >> 1); } +nexthub: + uvhub++; + uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); for_each_present_cpu(cpu) { -- cgit v1.2.3 From 90cc7d944981a6d06b49bb26fde1b490e28c90e5 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Remove BAU check for stay-busy Remove a faulty assumption that a long running BAU request has encountered a hardware problem and will never finish. Numalink congestion can make a request appear to have encountered such a problem, but it is not safe to cancel the request. If such a cancel is done but a reply is later received we can miss a TLB shootdown. We depend upon the max_bau_concurrent 'throttle' to prevent the stay-busy case from happening. 
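For reference, the throttle in question is the admission gate at the top of uv_flush_send_and_wait(): a sender must raise active_descriptor_count without reaching max_bau_concurrent before it may start a broadcast, and it decrements the count once its descriptor completes. A toy model of that check (self-contained C11 atomics standing in for the kernel's spinlock-based atomic_inc_unless_ge()):

#include <stdatomic.h>

/* Returns 1 and increments *v only while *v is below 'limit';
 * returns 0 otherwise, in which case the caller spins and retries. */
static int inc_unless_ge(atomic_int *v, int limit)
{
	int cur = atomic_load(v);

	while (cur < limit) {
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return 1;	/* admitted; one more active descriptor */
	}
	return 0;			/* at the limit; not admitted */
}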
Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/kernel/tlb_uv.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 1c8f1e9bf74..c19b870ea58 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -402,7 +402,6 @@ struct bau_control { unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; cycles_t send_message; - spinlock_t masks_lock; spinlock_t uvhub_lock; spinlock_t queue_lock; /* tunables */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ab929e97650..dc962b5ac87 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -405,12 +405,10 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr; unsigned long mask; cycles_t ttime; - cycles_t timeout_time; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; - timeout_time = get_cycles() + bcp->timeout_interval; /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) @@ -450,26 +448,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * descriptor_status is still BUSY */ cpu_relax(); - relaxes++; - if (relaxes >= 10000) { - relaxes = 0; - if (get_cycles() > timeout_time) { - quiesce_local_uvhub(hmaster); - - /* single-thread the register change */ - spin_lock(&hmaster->masks_lock); - mmr = uv_read_local_mmr(mmr_offset); - mask = 0UL; - mask |= (3UL < right_shift); - mask = ~mask; - mmr &= mask; - uv_write_local_mmr(mmr_offset, mmr); - spin_unlock(&hmaster->masks_lock); - end_uvhub_quiesce(hmaster); - stat->s_busy++; - return FLUSH_GIVEUP; - } - } } } bcp->conseccompletes++; @@ -1580,7 +1558,6 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); - spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; uvhub_mask |= (1 << uvhub); -- cgit v1.2.3 From 7fba1bcd4844a4a8619a03bf51cabc92aea365a8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU regular message type The Broadcast Assist Unit messages have a regular or retry message type. The regular type was not being set, but needs to be, because the lack of a message type is sometimes used to identify an unused entry in the message queue. Also removing some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc962b5ac87..4cb14dbd7fa 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -580,23 +580,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } time1 = get_cycles(); do { - /* - * Every message from any given cpu gets a unique message - * sequence number. But retries use that same number. - * Our message may have timed out at the destination because - * all sw-ack resources are in use and there is a timeout - * pending there. In that case, our last send never got - * placed into the queue and we need to persist until it - * does. 
- * - * Make any retry a type MSG_RETRY so that the destination will - * free any resource held by a previous message from this cpu. - */ if (try == 0) { - /* use message type set by the caller the first time */ + bau_desc->header.msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - /* use RETRY type on all the rest; same sequence */ bau_desc->header.msg_type = MSG_RETRY; stat->s_retry_messages++; } -- cgit v1.2.3 From 450a007eebaf430426ea8f89bbc3f287949905b2 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU broadcast to the local hub Make the Broadcast Assist Unit driver use the BAU for TLB shootdowns of cpu's on the local uvhub. It was previously thought that IPI might be faster to the cpu's on the local hub. But the IPI operation would have to follow the completion of the BAU broadcast anyway. So we broadcast to the local uvhub in all cases except when the current cpu was the only local cpu in the mask. This simplifies uv_flush_send_and_wait() in that it returns either all shootdowns complete, or none. Adjust the statistics to account for shootdowns on the local uvhub. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 5 ++ arch/x86/kernel/tlb_uv.c | 138 +++++++++++++++------------------------ 2 files changed, 58 insertions(+), 85 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index c19b870ea58..7f6ea611cb7 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -346,6 +346,11 @@ struct ptc_stats { unsigned long s_time; /* time spent in sending side */ unsigned long s_retriesok; /* successful retries */ unsigned long s_ntargcpu; /* total number of cpu's targeted */ + unsigned long s_ntargself; /* times the sending cpu was targeted */ + unsigned long s_ntarglocals; /* targets of cpus on the local blade */ + unsigned long s_ntargremotes; /* targets of cpus on remote blades */ + unsigned long s_ntarglocaluvhub; /* targets of the local hub */ + unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 4cb14dbd7fa..a1615058fad 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -400,10 +400,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, int this_cpu, struct bau_control *bcp, struct bau_control *smaster, long try) { - int relaxes = 0; unsigned long descriptor_status; - unsigned long mmr; - unsigned long mask; cycles_t ttime; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; @@ -524,25 +521,19 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * The flush_mask contains the cpus the broadcast is to be sent to, plus * cpus that are on the local uvhub. * - * Returns NULL if all flushing represented in the mask was done. The mask - * is zeroed. - * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set, representing any cpus on the local - * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. + * Returns 0 if all flushing represented in the mask was done. 
+ * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. */ -const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, - struct bau_control *bcp) +int uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, struct bau_control *bcp) { int right_shift; - int uvhub; - int bit; int completion_status = 0; int seq_number = 0; long try = 0; int cpu = bcp->uvhub_cpu; int this_cpu = bcp->cpu; - int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; @@ -552,10 +543,6 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; - /* - * Spin here while there are hmaster->max_bau_concurrent or more active - * descriptors. This is the per-uvhub 'throttle'. - */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)) { @@ -591,9 +578,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, right_shift, this_cpu, bcp, smaster, try); @@ -652,16 +637,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (hmaster->max_bau_concurrent < hmaster->max_bau_concurrent_constant)) hmaster->max_bau_concurrent++; - - /* - * hold any cpu not timing out here; no other cpu currently held by - * the 'throttle' should enter the activation code - */ while (hmaster->uvhub_quiesce) cpu_relax(); atomic_dec(&hmaster->active_descriptor_count); - - /* guard against cycles wrap */ if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; @@ -674,32 +652,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } } } else - stat->s_requestor--; /* don't count this one */ + stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; else if (completion_status == FLUSH_GIVEUP) { - /* - * Cause the caller to do an IPI-style TLB shootdown on - * the target cpu's, all of which are still in the mask. - */ stat->s_giveup++; - return flush_mask; + return 1; } - - /* - * Success, so clear the remote cpu's from the mask so we don't - * use the IPI method of shootdown on them. - */ - for_each_cpu(bit, flush_mask) { - uvhub = uv_cpu_to_blade_id(bit); - if (uvhub == this_uvhub) - continue; - cpumask_clear_cpu(bit, flush_mask); - } - if (!cpumask_empty(flush_mask)) - return flush_mask; - - return NULL; + return 0; } /** @@ -731,10 +691,11 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int remotes; int tcpu; int uvhub; int locals = 0; + int remotes = 0; + int hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; @@ -768,54 +729,52 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. Only remote cpus are converted to uvhubs and copied. + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. 
*/ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* - * copy cpumask to flush_mask, removing current cpu - * (current cpu should already have been flushed by the caller and - * should never be returned if we return flush_mask) - */ + /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); if (cpu_isset(cpu, *cpumask)) - locals++; /* current cpu was targeted */ + stat->s_ntargself++; bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - remotes = 0; + + /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { uvhub = uv_cpu_to_blade_id(tcpu); - if (uvhub == bcp->uvhub) { - locals++; - continue; - } bau_uvhub_set(uvhub, &bau_desc->distribution); - remotes++; - } - if (remotes == 0) { - /* - * No off_hub flushing; return status for local hub. - * Return the caller's mask if all were local (the current - * cpu may be in that mask). - */ - if (locals) - return cpumask; + if (uvhub == bcp->uvhub) + locals++; else - return NULL; + remotes++; } + if ((locals + remotes) == 0) + return NULL; stat->s_requestor++; - stat->s_ntargcpu += remotes; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; remotes = bau_uvhub_weight(&bau_desc->distribution); - stat->s_ntarguvhub += remotes; - if (remotes >= 16) + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + stat->s_ntarguvhub += hubs; + if (hubs >= 16) stat->s_ntarguvhub16++; - else if (remotes >= 8) + else if (hubs >= 8) stat->s_ntarguvhub8++; - else if (remotes >= 4) + else if (hubs >= 4) stat->s_ntarguvhub4++; - else if (remotes >= 2) + else if (hubs >= 2) stat->s_ntarguvhub2++; else stat->s_ntarguvhub1++; @@ -824,10 +783,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc->payload.sending_cpu = cpu; /* - * uv_flush_send_and_wait returns null if all cpu's were messaged, or - * the adjusted flush_mask if any cpu's were not messaged. + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. 
*/ - return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + return NULL; + else + return cpumask; } /* @@ -976,9 +938,11 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); seq_printf(file, "retries rok resetp resett giveup sto bz throt "); seq_printf(file, @@ -994,10 +958,14 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntarguvhub, stat->s_ntarguvhub16, + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_ntargcpu, stat->s_dtimeout); + stat->s_dtimeout); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, -- cgit v1.2.3 From f6d8a56693426b1f29ff5cafda8be0d65e4e1870 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Modularize BAU send and wait Streamline the large uv_flush_send_and_wait() function by use of a couple of helper functions. And remove some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/kernel/tlb_uv.c | 82 +++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 7f6ea611cb7..42d412fd8b0 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -75,7 +75,6 @@ #define DESC_STATUS_DESTINATION_TIMEOUT 2 #define DESC_STATUS_SOURCE_TIMEOUT 3 -#define TIMEOUT_DELAY 10 /* * delay for 'plugged' timeout retries, in microseconds */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index a1615058fad..abf3c31f14c 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -484,6 +484,47 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Our retries are blocked by all destination swack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. 
+ */ +static void +destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void +destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + hmaster->max_bau_concurrent = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + /* * Completions are taking a very long time due to a congested numalink * network. @@ -518,7 +559,7 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast is to be sent to, plus + * The flush_mask contains the cpus the broadcast is to be sent to including * cpus that are on the local uvhub. * * Returns 0 if all flushing represented in the mask was done. @@ -553,7 +594,6 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)); } - while (hmaster->uvhub_quiesce) cpu_relax(); @@ -584,40 +624,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, right_shift, this_cpu, bcp, smaster, try); if (completion_status == FLUSH_RETRY_PLUGGED) { - /* - * Our retries may be blocked by all destination swack - * resources being consumed, and a timeout pending. In - * that case hardware immediately returns the ERROR - * that looks like a destination timeout. 
- */ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } + destination_plugged(bau_desc, bcp, hmaster, stat); } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } + destination_timeout(bau_desc, bcp, hmaster, stat); } if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; @@ -628,10 +637,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } while ((completion_status == FLUSH_RETRY_PLUGGED) || (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - bcp->plugged_tries = 0; bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > bcp->complete_threshold) && (hmaster->max_bau_concurrent < @@ -740,7 +747,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); /* cpu statistics */ -- cgit v1.2.3 From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2010 21:35:17 +0200 Subject: x86: Unify dumpstack.h and stacktrace.h arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h declare interfaces that deal with the same topic. In fact, most of the files that include stacktrace.h also include dumpstack.h. Although dumpstack.h seems more reserved for the internals of stack traces, those internals are quite often needed to define specialized stack trace operations. And perf event arch headers are going to need access to such low level operations anyway. So don't continue to bother with dumpstack.h, as it is no longer about isolated deep internals. v2: fix struct stack_frame definition conflict in sysprof Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: Thomas Gleixner Cc: Soeren Sandmann --- arch/x86/include/asm/stacktrace.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.c | 2 -- arch/x86/kernel/dumpstack.c | 1 - arch/x86/kernel/dumpstack.h | 56 --------------------------------------- arch/x86/kernel/dumpstack_32.c | 2 -- arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/stacktrace.c | 7 ++--- 7 files changed, 56 insertions(+), 65 deletions(-) delete mode 100644 arch/x86/kernel/dumpstack.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 4dab78edbad..a957463d3c7 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -1,6 +1,13 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H +#include + extern int kstack_depth_to_print; struct thread_info; @@ -42,4 +49,49 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } +#endif + + return (unsigned long)frame; +} + #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c77586061bc..9632fb61e8f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b..6e8752c1bd5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd4..00000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs 
*regs, - unsigned long *stack, unsigned long bp, char *log_lvl); -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); -extern unsigned int code_bytes; -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d9..0f6376ffa2d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f..57a21f11c79 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6..ea54d029fe2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; -- cgit v1.2.3 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. This means frame pointers are no longer necessary to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implementation.
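A minimal sketch of why the helper must be a macro rather than an inline function (the fallback name below is invented for illustration, it is not the kernel's): generic code can only detect an arch-supplied override with the preprocessor, and an inline function is invisible to #ifndef, so there would be no way to know when to install the default.

	#ifndef perf_arch_fetch_caller_regs
	/* no arch override: fall back to a generic version (hypothetical name) */
	#define perf_arch_fetch_caller_regs(regs, ip) \
		perf_fetch_caller_regs_default(regs, ip)
	#endif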
Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- 3 files changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e..02de29830ff 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463d3c7..2b16a2ad23d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f..2c075fe573d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; -- cgit v1.2.3 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successful ->commit_txn also closes the transaction.
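The resulting calling convention can be sketched as follows (condensed caller-side code under assumed helper names such as event_sched_in(); this is an illustration, not the perf core's exact code):

	static int group_sched_in_sketch(struct pmu *pmu, struct perf_event *group)
	{
		struct perf_event *event;

		pmu->start_txn(pmu);			/* open the transaction */
		list_for_each_entry(event, &group->sibling_list, group_entry)
			event_sched_in(event);		/* schedulability test deferred */

		if (pmu->commit_txn(pmu)) {
			/* group did not fit: roll back and close the transaction */
			pmu->cancel_txn(pmu);
			return -EAGAIN;
		}
		/* success: commit_txn has already closed the transaction,
		 * so cancel_txn() must not be called on this path */
		return 0;
	}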
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a1..af04c6fa59c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } -- cgit v1.2.3 From 68aa00ac0a82e9a876c799bf6be7622b8f1c8517 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 3 Jun 2010 01:23:04 +0400 Subject: perf, x86: Make a second write to performance counter if needed On Netburst PMU we need a second write to a performance counter due to a cpu erratum. A simple flag test instead of alternative instructions was chosen because wrmsrl is already a macro and, if virtualization is turned on, it would need an additional wrapper call, which is more expensive. nb: we should probably switch to jump-labels once that facility reaches mainline.
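The jump-label note can be made concrete with a sketch; DEFINE_STATIC_KEY_FALSE and static_branch_unlikely are the names the facility eventually gained in mainline, long after this patch, so treat this purely as an illustration of the idea:

	#include <linux/jump_label.h>

	static DEFINE_STATIC_KEY_FALSE(perfctr_second_write);

	static void perfctr_write_sketch(unsigned int msr, u64 val)
	{
		wrmsrl(msr, val);
		/* on unaffected CPUs this branch is a patched-out NOP,
		 * avoiding the flag load and conditional jump entirely */
		if (static_branch_unlikely(&perfctr_second_write))
			wrmsrl(msr, val);	/* Netburst erratum: write twice */
	}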
Signed-off-by: Cyrill Gorcunov Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: Lin Ming Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100602212304.GC5264@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 +++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index af04c6fa59c..79e199843db 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -220,6 +220,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; void (*quirks)(void); + int perfctr_second_write; int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); @@ -925,8 +926,17 @@ x86_perf_event_set_period(struct perf_event *event) */ atomic64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d..9286e736a70 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -829,6 +829,15 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << 39) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, }; static __init int p4_pmu_init(void) -- cgit v1.2.3 From 1996bda2a42480c275656233e631ee0966574be4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:05:13 +0200 Subject: arch: Implement local64_t On 64bit, local_t is of size long, and thus we make local64_t an alias. On 32bit, we fall back to atomic64_t. (An architecture can provide an optimized 32-bit version.) (This new facility is to be used by perf events optimizations.) Signed-off-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/local64.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/include/asm/local64.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h new file mode 100644 index 00000000000..36c93b5cc23 --- /dev/null +++ b/arch/x86/include/asm/local64.h @@ -0,0 +1 @@ +#include -- cgit v1.2.3 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Now that all modifications to event->count (and ->prev_count and ->period_left) are local to a cpu, change them to local64_t so we avoid the LOCK'ed ops.
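For reference, a self-contained sketch of the usage pattern this conversion relies on (the struct and function names here are made up): a counter that is only ever written by its owning CPU can be a local64_t, which on x86-64 compiles to plain, non-LOCK'ed instructions while remaining safe to read from other CPUs.

	#include <asm/local64.h>

	struct sample_counter {
		local64_t count;
	};

	/* writer side: must run on the CPU that owns 'c' */
	static void sample_counter_add(struct sample_counter *c, long delta)
	{
		local64_add(delta, &c->count);	/* no lock prefix needed */
	}

	/* reader side: any CPU may observe the value */
	static long sample_counter_read(struct sample_counter *c)
	{
		return local64_read(&c->count);
	}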
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db..2d0d2906927 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); -- cgit v1.2.3 From 147ec4d2361e355ab32499f739cc24845ceb89da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:39 +0200 Subject: x86: Make save_stack_address() !CONFIG_FRAME_POINTER friendly If CONFIG_FRAME_POINTER=n, print_context_stack() shouldn't neglect the non-reliable addresses on the stack; they are all we have if dump_trace(bp) is called with a wrong or zero bp. For example, /proc/pid/stack doesn't work if CONFIG_FRAME_POINTER=n. This patch obviously has no effect if CONFIG_FRAME_POINTER=y, otherwise it reverts 1650743c "x86: don't save unreliable stack trace entries". Also, remove the unnecessary type-cast.
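To make the reliable/non-reliable distinction concrete, here is a simplified sketch of what the stack walker feeds these callbacks; on_frame_chain() is an invented helper standing in for the real frame-pointer following logic:

	/*
	 * With CONFIG_FRAME_POINTER=y only return addresses on the bp chain
	 * are reported with reliable != 0; raw text addresses found while
	 * scanning the stack come in with reliable == 0.  Without frame
	 * pointers there is no chain, so every address is "unreliable",
	 * and filtering those out, as the old code did, left the trace empty.
	 */
	static void walk_stack_sketch(unsigned long *stack, unsigned long *end,
				      const struct stacktrace_ops *ops, void *data)
	{
		while (stack < end) {
			unsigned long addr = *stack++;

			if (__kernel_text_address(addr))
				ops->address(data, addr, on_frame_chain(addr));
		}
	}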
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <20100603193239.GA31530@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index ea54d029fe2..abc321d5587 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -26,8 +26,10 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (trace->skip > 0) { trace->skip--; return; @@ -39,9 +41,11 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (in_sched_functions(addr)) return; if (trace->skip > 0) { -- cgit v1.2.3 From 018378c55b03f88ff513aba4e0e93b8d4a9cf241 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:43 +0200 Subject: x86: Unify save_stack_address() and save_stack_address_nosched() Cleanup. Factor the common code in save_stack_address() and save_stack_address_nosched(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar LKML-Reference: <20100603193243.GA31534@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index abc321d5587..b53c525368a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -23,13 +23,16 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) return; #endif + if (nosched && in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -38,22 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return; -#endif - if (in_sched_functions(addr)) - return; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; + return __save_stack_address(data, addr, reliable, true); } static const struct stacktrace_ops save_stack_ops = { -- cgit v1.2.3 From d6d4d4205cf4ce4ba13bc320305afbda25303496 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Jun 2010 12:07:46 +0200 Subject: x86, xsave: Cleanup return codes in check_for_xstate() The places which call check_for_xstate() only care about zero or non-zero so this patch doesn't change how 
the code runs, but it's a cleanup. The main reason for this patch is that I'm looking for places which don't return -EFAULT for copy_from_user() failures. Signed-off-by: Dan Carpenter LKML-Reference: <20100603100746.GU5483@bicker> Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/kernel/xsave.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24..980149867a1 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } -- cgit v1.2.3 From 8cc1176e5de534d55cb26ff0cef3fd0d6ad8c3c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 18:18:40 +0200 Subject: x86, cacheinfo: Carve out L3 cache slot accessors This is in preparation for disabling L3 cache indices after having received correctable ECCs in the L3 cache. Now we allow for initial setting of a disabled index slot (write once) and deny writing new indices to it after it has been disabled. Also, we deny using both slots to disable one and the same index. Userspace can restore the previously disabled indices by rewriting those sysfs entries when booting. Cleanup and reorganize code while at it. Signed-off-by: Borislav Petkov LKML-Reference: <20100602161840.GI18327@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 108 ++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae2062cf..898c2f4eab8 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) return l3; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) { int node; @@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3 = l3_caches[node]; } +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. 
+ */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int slot) { - struct pci_dev *dev = this_leaf->l3->dev; - unsigned int reg = 0; + int index; if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (!dev) - return -EINVAL; + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); - pci_read_config_dword(dev, 0x1BC + slot * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + return sprintf(buf, "FREE\n"); } #define SHOW_CACHE_DISABLE(slot) \ @@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, } } - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) { - struct pci_dev *dev = this_leaf->l3->dev; - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - unsigned long val = 0; + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - amd_l3_disable_index(this_leaf->l3, cpu, slot, val); - + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } @@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 
amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } -- cgit v1.2.3 From 12d8a961289644d265d8b3e88201878837c3b814 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 20:29:21 +0200 Subject: x86, AMD: Extend support to future families Extend support to future families, and in particular: * extend direct mapping split of Tseg SMM area. * extend K8 flavored alternatives (NOPS). * rep movs* prefix is fast in ucode. Signed-off-by: Borislav Petkov LKML-Reference: <20100602182921.GA21557@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d..12b9cff047c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* -- cgit v1.2.3 From 1f9a0bd4989fd16842ad71fc89240b48ab191446 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:08 +0800 Subject: x86, mce: Rename MSR_IA32_MCx_CTL2 value Rename CMCI_EN to MCI_CTL2_CMCI_EN and CMCI_THRESHOLD_MASK to MCI_CTL2_CMCI_THRESHOLD_MASK to make naming consistent. Signed-off-by: Huang Ying LKML-Reference: <1275977348.3444.659.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/include/asm/msr-index.h | 3 --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f32a4301c4d..82db1d8f064 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,10 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +/* CTL2 register defines */ +#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL + #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f..38f66eb5854 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -94,9 +94,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920..faf7b2919a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,19 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +155,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } -- cgit v1.2.3 From 3c417588603e5411f29d22a40f3b5ff71529a4f0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:10 +0800 Subject: x86, mce: Fix MSR_IA32_MCI_CTL2 CMCI threshold setup It is reported that CMCI is not raised when the number of corrected errors reaches the preset threshold. After inspection, it was found that the MSR_IA32_MCI_CTL2 threshold field was not set up properly. This patch fixes it. The value of MCI_CTL2_CMCI_THRESHOLD_MASK is also corrected according to the x86_64 Software Developer's Manual. Reported-by: Shaohui Zheng Signed-off-by: Huang Ying LKML-Reference: <1275977350.3444.660.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 82db1d8f064..c62c13cb978 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN (1ULL << 30) -#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index faf7b2919a8..6fcd0936194 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -102,6 +102,7 @@ static void cmci_discover(int banks, int boot) continue; } + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); -- cgit v1.2.3 From a2d7b0d4852536273b65d16fe179c65184fe5e2d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:35:39 +0800 Subject: x86, mce: Use HW_ERR in MCE handler Use the HW_ERR printk prefix in the MCE handler, to make it more explicit that this is a hardware error rather than a software error. Signed-off-by: Huang Ying LKML-Reference: <1275978939.3444.668.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..094b228c8b0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -211,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!"
: "", m->cs, m->ip); @@ -224,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -241,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; @@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) apei_err = apei_write_mce(final); } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -1220,7 +1208,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } -- cgit v1.2.3 From ec8c27e04f89a7575ca2c4facb99152e03d6a99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:45:36 -0700 Subject: mce: convert to rcu_dereference_index_check() The mce processing applies rcu_dereference_check() to integers used as array indices. This patch therefore moves mce to the new RCU API rcu_dereference_index_check() that avoids the sparse processing that would otherwise result in compiler errors. Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..0e78657e29c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) -- cgit v1.2.3 From 421f91d21ad6f799dc7b489bb33cc560ccc56f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 11 Jun 2010 12:17:00 +0200 Subject: fix typos concerning "initiali[zs]e" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/head32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e0161..192cd7ee35c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -459,7 +459,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e24603739..784360c0625 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; -- cgit v1.2.3 From 23016bf0d25d62c45d8b8f61d55b290d704f7a79 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 3 Jun 2010 23:22:28 -0400 Subject: x86: Look for IA32_ENERGY_PERF_BIAS support The new IA32_ENERGY_PERF_BIAS MSR allows system software to give hardware a hint whether OS policy favors more power saving, or more performance. This allows the OS to have some influence on internal hardware power/performance tradeoffs where the OS has previously had no influence. The support for this feature is indicated by CPUID.06H.ECX.bit3, as documented in the Intel Architectures Software Developer's Manual. This patch discovers support of this feature and displays it as "epb" in /proc/cpuinfo. Signed-off-by: Venkatesh Pallipadi LKML-Reference: Signed-off-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 3 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 46814591438..2a904f4071f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -162,6 +162,7 @@ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ /* Virtualization flags: Linux defined */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f..e57bc20683d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -238,6 +238,8 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 10fa5684a66..7369b4c2c55 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, -- cgit v1.2.3 From 8b8f79b927b6b302bb65fb8c56e7a19be5fbdbef Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 13 Jun 2010 23:56:54 +0200 Subject: x86, kmmio/mmiotrace: Fix double free of kmmio_fault_pages After every iounmap mmiotrace has to free kmmio_fault_pages, but it can't do it directly, so it defers freeing by RCU. It usually works, but when mmiotraced code calls ioremap-iounmap multiple times without sleeping between (so RCU won't kick in and start freeing) it can be given the same virtual address, so at every iounmap mmiotrace will schedule the same pages for release. Obviously it will explode on second free. Fix it by marking kmmio_fault_pages which are scheduled for release and not adding them second time. Signed-off-by: Marcin Slusarz Tested-by: Marcin Kocielnicki Tested-by: Shinpei KATO Acked-by: Pekka Paalanen Cc: Stuart Bennett Cc: Marcin Kocielnicki Cc: nouveau@lists.freedesktop.org Cc: LKML-Reference: <20100613215654.GA3829@joi.lan> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 16 +++++++++++++--- arch/x86/mm/testmmiotrace.c | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 5d0e67fff1a..e5d5e2ce9f7 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -45,6 +45,8 @@ struct kmmio_fault_page { * Protected by kmmio_lock, when linked into kmmio_page_table. 
*/ int count; + + bool scheduled_for_release; }; struct kmmio_delayed_release { @@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page, BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f); - f->release_next = *release_list; - *release_list = f; + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } } } @@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) prevp = &f->release_next; } else { *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; } - f = f->release_next; + f = *prevp; } spin_unlock_irqrestore(&kmmio_lock, flags); @@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p) kmmio_count--; spin_unlock_irqrestore(&kmmio_lock, flags); + if (!release_list) + return; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { pr_crit("leaking kmmio_fault_page objects.\n"); diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 8565d944f7c..38868adf07e 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -90,6 +90,27 @@ static void do_test(unsigned long size) iounmap(p); } +/* + * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in + * a short time. We had a bug in deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap_nocache(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); @@ -104,6 +125,7 @@ static int __init init(void) "and writing 16 kB of rubbish in there.\n", size >> 10, mmio_address); do_test(size); + do_test_bulk_ioremapping(); pr_info("All done.\n"); return 0; } -- cgit v1.2.3 From c882e0feb937af4e5b991cbd1c81536f37053e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Sch=C3=B6ne?= Date: Mon, 14 Jun 2010 13:37:20 +0200 Subject: x86, perf: Add power_end event to process_*.c cpu_idle routine Systems using the idle thread from process_32.c and process_64.c do not generate power_end events which could be traced using perf. This patch adds the event generation for such systems. 
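A condensed sketch of where the new tracepoint sits in the 32-bit idle loop after this patch (simplified from the hunks that follow; the 64-bit loop is analogous):

	void cpu_idle_sketch(void)
	{
		for (;;) {
			tick_nohz_stop_sched_tick(1);
			while (!need_resched()) {
				stop_critical_timings();
				pm_idle();	/* idle routine; may trace power_start */
				start_critical_timings();
				/* added: close the power event when idle ends */
				trace_power_end(smp_processor_id());
			}
			tick_nohz_restart_sched_tick();
			preempt_enable_no_resched();
			schedule();
			preempt_disable();
		}
	}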
Signed-off-by: Robert Schoene Acked-by: Arjan van de Ven Cc: Peter Zijlstra LKML-Reference: <1276515440.5441.45.camel@localhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++++ arch/x86/kernel/process_64.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af4..96586c3cbbb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,8 @@ #include #include +#include + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -111,6 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1..3d9ea531ddd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,8 @@ #include #include +#include + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -138,6 +140,9 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ -- cgit v1.2.3 From d7a0380dc3e6607d30ccdfc3cfc2ccee0d966716 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 16 Jun 2010 22:30:42 +0200 Subject: x86-64, mm: Initialize VDSO earlier on 64 bits When initrd is in use and a driver does request_module() in its module_init (i.e. __initcall or device_initcall), a modprobe process is created with VDSO mapping. But VDSO is inited even in __initcall, i.e. on the same level (at the same time), so it may not be inited yet (link order matters). Move the VDSO initialization code earlier by switching to something before rootfs_initcall where initrd is loaded as rootfs. Specifically to subsys_initcall. Do it for standard 64-bit path (init_vdso_vars) and for compat (sysenter_setup), just in case people have 32-bit initrd and ia32 emulation built-in. i386 (pure 32-bit) is not affected, since sysenter_setup() is called from check_bugs()->identify_boot_cpu() in start_kernel() before rest_init()->kernel_thread(kernel_init) where even kernel_init() calls do_basic_setup()->do_initcalls(). What this patch fixes are early modprobe crashes such as: Unpacking initramfs... Freeing initrd memory: 9324k freed modprobe[368]: segfault at 7fff4429c020 ip 00007fef397e160c \ sp 00007fff442795c0 error 4 in ld-2.11.2.so[7fef397df000+1f000] Signed-off-by: Jiri Slaby LKML-Reference: <1276720242-13365-1-git-send-email-jslaby@suse.cz> Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 02b442e9200..36df991985b 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_X86_64 -__initcall(sysenter_setup); +subsys_initcall(sysenter_setup); #ifdef CONFIG_SYSCTL /* Register vsyscall32 into the ABI table */ diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b814..43456ee1769 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -74,7 +74,7 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; return -ENOMEM; } -__initcall(init_vdso_vars); +subsys_initcall(init_vdso_vars); struct linux_binprm; -- cgit v1.2.3 From 05d0b0889ca9d033a960542af7f8a13b3ad4f630 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Jun 2010 14:36:26 -0700 Subject: x86, vdso: Error out if the vdso contains external references The vdso is a piece of userspace code which is supposed to be fully self-contained. Any external (undefined) reference is an error, and should be caught at compile time. This was giving us trouble when compiling with -Os on gcc 4.5.0, for example (failed inline). The need to do a buildtime check was pointed out by Andi Kleen. Reported-by: Andi Kleen LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 3 ++- arch/x86/vdso/checkundef.sh | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100755 arch/x86/vdso/checkundef.sh (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6b4ffedb93c..4a2afa1bac5 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh new file mode 100755 index 00000000000..490be1c38f9 --- /dev/null +++ b/arch/x86/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi -- cgit v1.2.3 From fd699c76552bbfa66631f019be415a87dbb08237 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Fri, 18 Jun 2010 17:46:53 -0400 Subject: x86, olpc: Add support for calling into OpenFirmware Add support for saving OFW's cif, and later calling into it to run OFW commands. OFW remains resident in memory, living within virtual range 0xff800000 - 0xffc00000. A single page directory entry points to the pgdir that OFW actually uses, so rather than saving the entire page table, we grab and install that one entry permanently in the kernel's page table. This is currently only used by the OLPC XO. Note that this particular calling convention breaks PAE and PAT, and so cannot be used on newer x86 hardware. Signed-off-by: Andres Salomon LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 9 ++++ arch/x86/include/asm/bootparam.h | 11 ++++- arch/x86/include/asm/olpc_ofw.h | 31 ++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head_32.S | 6 +++ arch/x86/kernel/olpc.c | 12 ++--- arch/x86/kernel/olpc_ofw.c | 104 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 6 +++ 8 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 arch/x86/include/asm/olpc_ofw.h create mode 100644 arch/x86/kernel/olpc_ofw.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..71c194db2e0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2062,6 +2062,15 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. +config OLPC_OPENFIRMWARE + bool "Support for OLPC's Open Firmware" + depends on !X86_64 && !X86_PAE + default y if OLPC + help + This option adds support for the implementation of Open Firmware + that is used on the OLPC XO-1 Children's Machine. + If unsure, say N here. + endif # X86_32 config K8_NB diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 6be33d83c71..8e6218550e7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -70,6 +70,14 @@ struct sys_desc_table { __u8 table[14]; }; +/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */ +struct olpc_ofw_header { + __u32 ofw_magic; /* OFW signature */ + __u32 ofw_version; + __u32 cif_handler; /* callback into OFW */ + __u32 irq_desc_table; +} __attribute__((packed)); + struct efi_info { __u32 efi_loader_signature; __u32 efi_systab; @@ -92,7 +100,8 @@ struct boot_params { __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ - __u8 _pad4[144]; /* 0x0b0 */ + struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ + __u8 _pad4[128]; /* 0x0c0 */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h new file mode 100644 index 00000000000..3e63d857c48 --- /dev/null +++ b/arch/x86/include/asm/olpc_ofw.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_OLPC_OFW_H +#define _ASM_X86_OLPC_OFW_H + +/* index into the page table containing the entry OFW occupies */ +#define OLPC_OFW_PDE_NR 1022 + +#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ + +#ifdef CONFIG_OLPC_OPENFIRMWARE + +/* run an OFW command by calling into the firmware */ +#define olpc_ofw(name, args, res) \ + __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) + +extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res); + +/* determine whether OFW is available and lives in the proper memory */ +extern void olpc_ofw_detect(void); + +/* install OFW's pde permanently into the kernel's pgtable */ +extern void setup_olpc_ofw_pgd(void); + +#else /* !CONFIG_OLPC_OPENFIRMWARE */ + +static inline void olpc_ofw_detect(void) { } +static inline void setup_olpc_ofw_pgd(void) { } + +#endif /* !CONFIG_OLPC_OPENFIRMWARE */ + +#endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b2208372..0925676266b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o diff 
--git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d8..ff4c453e13f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b..156605281f5 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include #include #include - -#ifdef CONFIG_OPEN_FIRMWARE -#include -#endif +#include struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -188,14 +185,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 00000000000..469ee438429 --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned 
long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd03..b008e788320 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include #include +#include #include #include @@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; -- cgit v1.2.3 From 75a9cac430a1bd2a5219c74508ca01b0ddfddc9a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 23 Jun 2010 20:27:00 -0400 Subject: x86, olpc: Add comment about implicit optimization barrier Signed-off-by: Andres Salomon Cc: H. Peter Anvin LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc_ofw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 469ee438429..f5d499fbe74 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -35,6 +35,8 @@ void __init setup_olpc_ofw_pgd(void) /* install OFW's PDE permanently into the kernel's pgtable */ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -- cgit v1.2.3 From 0c4519e825c9e2b6a8310deff8582f8c35bfbba9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 21:21:27 +0200 Subject: x86: Set resume bit before returning from breakpoint exception Instruction breakpoints trigger before the instruction executes, and returning from the breakpoint handler brings us back to the instruction that triggered the breakpoint. This naturally leads to breakpoint recursion. To solve this, x86 has the Resume Bit trick. When the cpu flags have the RF flag set, the next instruction won't trigger any instruction breakpoint, and once this instruction is executed, RF is cleared again. This lets us jump back to the instruction that triggered the breakpoint without recursion. Use this when an instruction breakpoint triggers. Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Prasad Cc: Mahesh Salgaonkar Cc: Paul Mackerras Cc: Ingo Molnar Cc: Jason Wessel --- arch/x86/kernel/hw_breakpoint.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2f..eaa6ae2a010 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -466,6 +466,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin.
+ */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + rcu_read_unlock(); } /* -- cgit v1.2.3 From f7809daf64bf119fef70af172db6a0636fa51f92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 10:00:24 +0200 Subject: x86: Support for instruction breakpoints Instruction breakpoints need to have a specific length of 0 to be working. Bring this support but also take care the user is not trying to set an unsupported length, like a range breakpoint for example. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- arch/x86/include/asm/hw_breakpoint.h | 2 +- arch/x86/kernel/hw_breakpoint.c | 44 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 942255310e6..528a11e8d3e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -20,10 +20,10 @@ struct arch_hw_breakpoint { #include /* Available HW breakpoint length encodings */ +#define X86_BREAKPOINT_LEN_X 0x00 #define X86_BREAKPOINT_LEN_1 0x40 #define X86_BREAKPOINT_LEN_2 0x44 #define X86_BREAKPOINT_LEN_4 0x4c -#define X86_BREAKPOINT_LEN_EXECUTE 0x40 #ifdef CONFIG_X86_64 #define X86_BREAKPOINT_LEN_8 0x48 diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index eaa6ae2a010..a474ec37c32 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, { /* Len */ switch (x86_len) { + case X86_BREAKPOINT_LEN_X: + *gen_len = sizeof(long); + break; case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) info->address = bp->attr.bp_addr; + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. 
+ */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + /* Len */ switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: @@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; } - /* Type */ - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; - break; - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; - break; - case HW_BREAKPOINT_X: - info->type = X86_BREAKPOINT_EXECUTE; - break; - default: - return -EINVAL; - } - return 0; } /* @@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { + case X86_BREAKPOINT_LEN_X: + align = sizeof(long) -1; + break; case X86_BREAKPOINT_LEN_1: align = 0; break; -- cgit v1.2.3 From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: increase max_active of keventd and kill current_is_keventd() Define WQ_MAX_ACTIVE and create keventd with max_active set to half of it, which means that keventd can now process up to WQ_MAX_ACTIVE / 2 - 1 works concurrently. Unless some combination can result in a dependency loop longer than max_active, deadlock won't happen, and thus it's unnecessary to check current_is_keventd() before trying to schedule a work. Kill current_is_keventd(). (Lockdep annotations are broken. We need lock_map_acquire_read_norecurse()) Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Tony Luck Cc: Andi Kleen Cc: Oleg Nesterov --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d..4d90f376e98 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); -- cgit v1.2.3 From 567a9fd86735ccdc897768ed2dacdd5e83a13509 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Jun 2010 14:53:50 +0900 Subject: kprobes/x86: Fix kprobes to skip prefixes correctly Fix resume_execution() and is_IF_modifier() to skip x86 instruction prefixes correctly by using the x86 instruction attributes. Without this fix, resume_execution() can't handle instructions which have non-REX prefixes (REX prefixes are skipped). This can cause an unexpected kernel panic by hitting a bad address when a kprobe hits a two-byte ret (e.g. "repz ret" generated for Athlon/K8 optimization), because it just checks for "repz" and can't recognize the "ret" instruction. These prefixes can be found easily with the x86 instruction attributes. This patch introduces skip_prefixes() and uses it in resume_execution() and is_IF_modifier() to skip prefixes.
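For illustration, a hedged sketch (not part of the patch) of what the new skip_prefixes() does for the two-byte "repz ret" (0xf3 0xc3) that gcc emits for Athlon/K8 tuning; the helper name and behavior are taken from the diff below:

	static int classifies_repz_ret(void)
	{
		kprobe_opcode_t buf[] = { 0xf3, 0xc3 };	/* repz ret */
		kprobe_opcode_t *insn = skip_prefixes(buf);

		/*
		 * 0xf3 is a legacy prefix in the instruction attribute
		 * tables, so insn now points at the real opcode 0xc3 (ret),
		 * which resume_execution() knows how to fix up.  The old
		 * is_REX_prefix() check only skipped 0x40-0x4f and would
		 * have left insn pointing at 0xf3.
		 */
		return *insn == 0xc3;
	}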
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli LKML-Reference: <4C298A6E.8070609@hitachi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 345a4b1fe14..175f85ceace 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } @@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { -- cgit v1.2.3 From ea812ca1b06113597adcd8e70c0f84a413d97544 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 29 Jun 2010 18:38:00 +0000 Subject: x86: Align skb w/ start of cacheline on newer core 2/Xeon Arch x86 architectures can handle unaligned accesses in hardware, and it has been shown that unaligned DMA accesses can be expensive on Nehalem architectures. As such we should overwrite NET_IP_ALIGN to resolve this issue. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b8fe48ee2ed..b4293fc8b79 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,4 +457,13 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } +#ifdef CONFIG_MCORE2 +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. 
+ */ +#define NET_IP_ALIGN 0 +#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit v1.2.3 From 7475271004b66e9c22e1bb28f240a38c5d6fe76e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 Jul 2010 13:28:27 +0000 Subject: x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is based on a suggestion from Andi Kleen. The assumption is that there are not any x86 cores where unaligned access is really slow, and this change would allow for a performance improvement to still exist on configurations that are not necessarily optimized for Core 2. Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b4293fc8b79..1db9bd2281d 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,7 +457,6 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } -#ifdef CONFIG_MCORE2 /* * We handle most unaligned accesses in hardware. On the other hand * unaligned DMA can be quite expensive on some Nehalem processors. @@ -465,5 +464,4 @@ static inline void rdtsc_barrier(void) * Based on this we disable the IP header alignment in network drivers. */ #define NET_IP_ALIGN 0 -#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit v1.2.3 From 39ef13a4ac28aa64cfe1bc36e6e00f1096707a28 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 5 Jul 2010 10:09:29 +0800 Subject: perf, x86: P4 PMU -- redesign cache events To support cache events we have reserved the low 6 bits in hw_perf_event::config (which is a part of the CCCR register configuration, actually). These bits represent the Replay Event metric enumerated in enum P4_PEBS_METRIC. The caller should not care about which exact bits should be set and how -- the caller just chooses one P4_PEBS_METRIC entity and puts it into the config. The kernel will track it and set the appropriate additional MSR registers (metrics) when needed. The reason for this redesign was the PEBS enable bit, which should not be set until DS (and PEBS sampling) support is implemented properly. TODO ==== - PEBS sampling (note it's tricky and works with _one_ counter only, so for HT machines it will not be that easy to handle both threads) - tracking of PEBS register state; a user might need to turn PEBS off completely (ie no PEBS enable, no UOP_tag) but some other event may need it, such events clash and should not run simultaneously; at the moment we just don't support such events - eventually export user space bits in a separate header, which will allow user apps to configure raw events more conveniently.
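To make the packing concrete, a hedged sketch of how a raw Replay Event carrying a PEBS metric would be assembled; it mirrors the P4_GEN_CACHE_EVENT macro and the dTLB load-miss table entry in the diff below, and is not an additional API:

	/*
	 * ESCR event selection goes into the high 32 bits of the config;
	 * CCCR bits -- including the P4_PEBS_METRIC index in the low
	 * 6 bits -- go into the low 32 bits.
	 */
	u64 config =
		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_REPLAY_EVENT) |
				    P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)) |
		p4_config_pack_cccr(P4_PEBS_METRIC__dtlb_load_miss_retired |
				    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(P4_EVENT_REPLAY_EVENT))));

On event creation, p4_validate_raw_event() checks the metric index against p4_pebs_bind_map[], and p4_pmu_enable_pebs() later writes the bound values into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT.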
Signed-off-by: Cyrill Gorcunov Signed-off-by: Lin Ming Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <1278295769.9540.15.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 99 ++++++++++++----------- arch/x86/kernel/cpu/perf_event_p4.c | 147 ++++++++++++++++++++++++++--------- 2 files changed, 163 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 64a8ebff06f..def500776b1 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -19,7 +19,6 @@ #define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) #define ARCH_P4_MAX_CCCR (18) -#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2) #define P4_ESCR_EVENT_MASK 0x7e000000U #define P4_ESCR_EVENT_SHIFT 25 @@ -71,10 +70,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Custom bits in reerved CCCR area */ -#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU - - /* Non HT mask */ #define P4_CCCR_MASK \ (P4_CCCR_OVF | \ @@ -106,8 +101,7 @@ * ESCR and CCCR but rather an only packed value should * be unpacked and written to a proper addresses * - * the base idea is to pack as much info as - * possible + * the base idea is to pack as much info as possible */ #define p4_config_pack_escr(v) (((u64)(v)) << 32) #define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) @@ -130,8 +124,6 @@ t; \ }) -#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK) - #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) @@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr) return escr; } +/* + * This are the events which should be used in "Event Select" + * field of ESCR register, they are like unique keys which allow + * the kernel to determinate which CCCR and COUNTER should be + * used to track an event + */ enum P4_EVENTS { P4_EVENT_TC_DELIVER_MODE, P4_EVENT_BPU_FETCH_REQUEST, @@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES { * a caller should use P4_ESCR_EMASK_NAME helper to * pick the EventMask needed, for example * - * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD) + * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) */ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), @@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS { P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), }; -/* P4 PEBS: stale for a while */ -#define P4_PEBS_METRIC_MASK 0x00001fffU -#define P4_PEBS_UOB_TAG 0x01000000U -#define P4_PEBS_ENABLE 0x02000000U - -/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */ -#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001 -#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002 -#define P4_PEBS__dtlb_load_miss_retired 0x3000004 -#define P4_PEBS__dtlb_store_miss_retired 0x3000004 -#define P4_PEBS__dtlb_all_miss_retired 0x3000004 -#define P4_PEBS__tagged_mispred_branch 0x3018000 -#define P4_PEBS__mob_load_replay_retired 0x3000200 -#define P4_PEBS__split_load_retired 0x3000400 -#define P4_PEBS__split_store_retired 0x3000400 - -#define P4_VERT__1stl_cache_load_miss_retired 0x0000001 -#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_load_miss_retired 0x0000001 -#define P4_VERT__dtlb_store_miss_retired 0x0000002 -#define 
P4_VERT__dtlb_all_miss_retired 0x0000003 -#define P4_VERT__tagged_mispred_branch 0x0000010 -#define P4_VERT__mob_load_replay_retired 0x0000001 -#define P4_VERT__split_load_retired 0x0000001 -#define P4_VERT__split_store_retired 0x0000002 - -enum P4_CACHE_EVENTS { - P4_CACHE__NONE, - - P4_CACHE__1stl_cache_load_miss_retired, - P4_CACHE__2ndl_cache_load_miss_retired, - P4_CACHE__dtlb_load_miss_retired, - P4_CACHE__dtlb_store_miss_retired, - P4_CACHE__itlb_reference_hit, - P4_CACHE__itlb_reference_miss, - - P4_CACHE__MAX +/* + * P4 PEBS specifics (Replay Event only) + * + * Format (bits): + * 0-6: metric from P4_PEBS_METRIC enum + * 7 : reserved + * 8 : reserved + * 9-11 : reserved + * + * Note we have UOP and PEBS bits reserved for now + * just in case if we will need them once + */ +#define P4_PEBS_CONFIG_ENABLE (1 << 7) +#define P4_PEBS_CONFIG_UOP_TAG (1 << 8) +#define P4_PEBS_CONFIG_METRIC_MASK 0x3f +#define P4_PEBS_CONFIG_MASK 0xff + +/* + * mem: Only counters MSR_IQ_COUNTER4 (16) and + * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling + */ +#define P4_PEBS_ENABLE 0x02000000U +#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U + +#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK) +#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK) + +#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask)) + +enum P4_PEBS_METRIC { + P4_PEBS_METRIC__none, + + P4_PEBS_METRIC__1stl_cache_load_miss_retired, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired, + P4_PEBS_METRIC__dtlb_load_miss_retired, + P4_PEBS_METRIC__dtlb_store_miss_retired, + P4_PEBS_METRIC__dtlb_all_miss_retired, + P4_PEBS_METRIC__tagged_mispred_branch, + P4_PEBS_METRIC__mob_load_replay_retired, + P4_PEBS_METRIC__split_load_retired, + P4_PEBS_METRIC__split_store_retired, + + P4_PEBS_METRIC__max }; #endif /* PERF_EVENT_P4_H */ + diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 9286e736a70..107711bf0ee 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -21,22 +21,36 @@ struct p4_event_bind { char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; -struct p4_cache_event_bind { +struct p4_pebs_bind { unsigned int metric_pebs; unsigned int metric_vert; }; -#define P4_GEN_CACHE_EVENT_BIND(name) \ - [P4_CACHE__##name] = { \ - .metric_pebs = P4_PEBS__##name, \ - .metric_vert = P4_VERT__##name, \ +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ } -static struct p4_cache_event_bind p4_cache_event_bind_map[] = { - P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + 
P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), }; /* @@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { }, }; -#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ p4_config_pack_escr(P4_ESCR_EVENT(event) | \ P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(cache_event | \ + p4_config_pack_cccr(metric | \ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) static __initconst const u64 p4_hw_cache_event_ids @@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__1stl_cache_load_miss_retired), + P4_PEBS_METRIC__1stl_cache_load_miss_retired), }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__2ndl_cache_load_miss_retired), + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), }, }, [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_load_miss_retired), + P4_PEBS_METRIC__dtlb_load_miss_retired), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_store_miss_retired), + P4_PEBS_METRIC__dtlb_store_miss_retired), }, }, [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_CACHE__itlb_reference_hit), + P4_PEBS_METRIC__none), [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_CACHE__itlb_reference_miss), + P4_PEBS_METRIC__none), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v; + + /* user data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) { + pr_warning("P4 PMU: Unknown event code: %d\n", v); + return -EINVAL; + } + + /* + * it may have some screwed PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { + pr_warning("P4 PMU: PEBS are not supported yet\n"); + return -EINVAL; + } + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { + pr_warning("P4 PMU: Unknown metric code: %d\n", v); + return -EINVAL; + } + + return 0; +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); int rc = 0; - unsigned int evnt; u32 escr, cccr; /* @@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { - /* user data may have out-of-bound event index */ - evnt = p4_config_unpack_event(event->attr.config); - if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { - rc = -EINVAL; + rc = p4_validate_raw_event(event); + if (rc) goto out; - } /* * We don't control raw events so it's up to the caller @@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event) * on HT machine but allow HT-compatible specifics to be * passed on) * + * Note that for RAW events we allow user to use 
P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + * * XXX: HT wide things should check perf_paranoid_cpu() && * CAP_SYS_ADMIN */ event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT)); + p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); } rc = x86_setup_perfctr(event); @@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) return overflow; } +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * noone is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsence from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + static inline void p4_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void) continue; p4_pmu_disable_event(event); } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event) int thread = p4_ht_config_thread(hwc->config); u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); unsigned int idx = p4_config_unpack_event(hwc->config); - unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); struct p4_event_bind *bind; - struct p4_cache_event_bind *bind_cache; u64 escr_addr, cccr; bind = &p4_event_bind_map[idx]; @@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event) cccr = p4_config_unpack_cccr(hwc->config); /* - * it could be Cache event so that we need to - * set metrics into additional MSRs + * it could be Cache event so we need to write metrics + * into additional MSRs */ - BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); - if (idx_cache > P4_CACHE__NONE && - idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { - bind_cache = &p4_cache_event_bind_map[idx_cache]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); - } + p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); (void)checking_wrmsrl(hwc->config_base + hwc->idx, -- cgit v1.2.3 From 8e221b6db4477643fefc885a97ea9889ac733140 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 22 Jun 2010 16:23:37 -0700 Subject: x86: Avoid unnecessary __clear_user() and xrstor in signal handling fxsave/xsave doesn't touch all the bytes in the memory layout used by these instructions. 
Specifically SW reserved (bytes 464..511) fields in the fxsave frame and the reserved fields in the xsave header. To present a clean context for the signal handling, just clear these fields instead of clearing the complete fxsave/xsave memory layout, when we dump these registers directly to the user signal frame. Also avoid the call to second xrstor (which inits the state not passed in the signal frame) in restore_user_xstate() if all the state has already been restored by the first xrstor. These changes improve the performance of signal handling(by ~3-5% as measured by the lat_sig). Signed-off-by: Suresh Siddha LKML-Reference: <1277249017.2847.85.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 9 +++++++++ arch/x86/include/asm/xsave.h | 10 ++++++++++ arch/x86/kernel/xsave.c | 12 ++---------- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..0f1cf5d53dd 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -127,6 +127,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..30dfc81804d 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -59,6 +59,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu) static inline int xsave_user(struct xsave_struct __user *buf) { int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->xsave_hdr, + sizeof(struct xsave_hdr_struct)); + if (unlikely(err)) + return -EFAULT; + __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24..6e73db1b7b4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -91,14 +91,6 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - if (use_xsave()) err = xsave_user(buf); else @@ -184,8 +176,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; -- cgit v1.2.3 From 24da9c26f3050aee9314ec09930a24c80fe76352 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 7 Jul 2010 10:15:12 -0700 Subject: x86, cpu: Add CPU flags for F16C and RDRND Add support for the newly documented F16C (16-bit floating point conversions) and RDRND (RDRAND instruction) CPU feature flags. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 2a904f4071f..aeb6f3f9b2c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -124,6 +124,8 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -- cgit v1.2.3 From 83a7a2ad2a9173dcabc05df0f01d1d85b7ba1c2c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 10 Jun 2010 00:10:43 +0000 Subject: x86, alternatives: Use 16-bit numbers for cpufeature index We already have cpufeature indicies above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 7 ++++--- arch/x86/include/asm/cpufeature.h | 14 ++++++++------ arch/x86/kernel/entry_32.S | 2 +- arch/x86/lib/clear_page_64.S | 2 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/memcpy_64.S | 2 +- arch/x86/lib/memset_64.S | 2 +- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 03b6bb5394a..bc6abb7bc7e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,10 +45,9 @@ struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; - u8 cpuid; /* cpuid bit set for replacement */ + u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ - u8 pad1; #ifdef CONFIG_X86_64 u32 pad2; #endif @@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end) _ASM_ALIGN "\n" \ _ASM_PTR "661b\n" /* label */ \ _ASM_PTR "663f\n" /* new instruction */ \ - " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 46814591438..e8b88967de3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -291,7 +291,7 @@ extern const char * const x86_power_flags[32]; * patch the target code for additional performance. 
* */ -static __always_inline __pure bool __static_cpu_has(u8 bit) +static __always_inline __pure bool __static_cpu_has(u16 bit) { #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) asm goto("1: jmp %l[t_no]\n" @@ -300,11 +300,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "0\n" /* no replacement */ - " .byte %P0\n" /* feature bit */ + " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ - " .byte 0xff + 0 - (2b-1b)\n" /* padding */ ".previous\n" + /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); return true; t_no: @@ -318,10 +318,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "3f\n" - " .byte %P1\n" /* feature bit */ + " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" "3: movb $1,%0\n" @@ -337,7 +339,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ + __builtin_constant_p(bit) ? \ __static_cpu_has(bit) : \ boot_cpu_has(bit) \ ) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..7862cf510ea 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -914,7 +914,7 @@ ENTRY(simd_coprocessor_error) .balign 4 .long 661b .long 663f - .byte X86_FEATURE_XMM + .word X86_FEATURE_XMM .byte 662b-661b .byte 664f-663f .previous diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ebeafcce04a..aa4326bfb24 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -52,7 +52,7 @@ ENDPROC(clear_page) .align 8 .quad clear_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lclear_page_end - clear_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 727a5d46d2f..6fec2d1cebe 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -113,7 +113,7 @@ ENDPROC(copy_page) .align 8 .quad copy_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lcopy_page_end - copy_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index f82e884928a..bcbcd1e0f7d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -131,7 +131,7 @@ ENDPROC(__memcpy) .align 8 .quad memcpy .quad .Lmemcpy_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD /* * Replace only beginning, memcpy is used to apply alternatives, diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index e88d3b81644..09d34426965 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -121,7 +121,7 @@ ENDPROC(__memset) .align 8 .quad memset .quad .Lmemset_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lfinal - memset .byte .Lmemset_e - .Lmemset_c .previous -- cgit v1.2.3 From bdc802dcca1709b01988d57e91f9f35ce1609fcc Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 7 Jul 2010 17:29:18 -0700 Subject: x86, cpu: Support the features flags in new CPUID leaf 7 Intel has defined CPUID leaf 7 as the next set of feature flags (see the AVX specification, version 007). Add support for this new feature flags word. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/required-features.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeb6f3f9b2c..3ec9275cea4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -6,7 +6,7 @@ #include -#define NCAPINTS 9 /* N 32-bit words worth of info */ +#define NCAPINTS 10 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -159,14 +159,14 @@ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc + * CPUID levels like 0x6, 0xA etc, word 7 */ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -/* Virtualization flags: Linux defined */ +/* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ @@ -177,6 +177,9 @@ #define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ + #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include @@ -197,7 +200,9 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ ? 
1 : \ test_cpu_cap(c, bit)) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 64cf2d24fad..6c7fc25f2c3 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -84,5 +84,7 @@ #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 #define REQUIRED_MASK7 0 +#define REQUIRED_MASK8 0 +#define REQUIRED_MASK9 0 #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211..c7358303d8c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; -- cgit v1.2.3 From 5bbd4a336c81d32df71642abf310cf3d0c98dc9b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 7 Jul 2010 19:51:59 -0400 Subject: x86/apic/es7000_32: Remove unused variable In today's linux-next I got this compile warning: arch/x86/kernel/apic/es7000_32.c:132: warning: 'base' defined but not used Current patch solves the issue removing the unused variable. Signed-off-by: Javier Martinez Canillas Cc: Rakib Mullick Cc: Eric W. Biederman Cc: Cyrill Gorcunov Cc: Tejun Heo LKML-Reference: <1278546719.9020.4.camel@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a87fe..8593582d802 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,7 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { -- cgit v1.2.3 From 9279aa55061a280b826bdf9ba5ab5f6a566c1dfb Mon Sep 17 00:00:00 2001 From: Ky Srinivasan Date: Mon, 28 Jun 2010 08:48:55 -0600 Subject: x86: Export the symbol ms_hyperv This is needed so that the staging hyperv can properly access this symbol. Signed-off-by: K. Y. Srinivasan Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mshyperv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b..d944bf6c50e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); static bool __init ms_hyperv_platform(void) { -- cgit v1.2.3 From ffa71f33a820d1ab3f2fc5723819ac60fb76080b Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:22:40 +0900 Subject: x86, ioremap: Fix incorrect physical address handling in PAE mode Current x86 ioremap() doesn't handle physical address higher than 32-bit properly in X86_32 PAE mode. When physical address higher than 32-bit is passed to ioremap(), higher 32-bits in physical address is cleared wrongly. Due to this bug, ioremap() can map wrong address to linear address space. In my case, 64-bit MMIO region was assigned to a PCI device (ioat device) on my system. 
Because of this ioremap() bug, the wrong physical address (instead of the MMIO region) was mapped into the linear address space, so loading the ioatdma driver caused unexpected behavior (kernel panic, kernel hang, ...). Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE680.7090408@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12e4d2d3c11..754cb4cbce6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, vaddr; - resource_size_t last_addr; + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; @@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; - (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); - pfn++) { - + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) @@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, -- cgit v1.2.3 From 35be1b716a475717611b2dc04185e9d80b9cb693 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:23:57 +0900 Subject: x86, ioremap: Fix normal ram range check The check for normal RAM in the x86 ioremap() code does not work for the last page frame in the specified physical address range. Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE6CD.1080704@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 754cb4cbce6..d41d3a9036c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -101,7 +101,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { + for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -- cgit v1.2.3 From fa97bdf92709adaaf8b9a5164a895e262a4fcf60 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 11 Jul 2010 11:06:57 +0300 Subject: x86, setup: Early-boot serial I/O support This patch adds serial I/O support to the real-mode setup (very early boot) printf(). It's useful for debugging boot code when running Linux under KVM, for example. The actual code was lifted from early printk. Cc: Cyrill Gorcunov Cc: Ingo Molnar Cc: Yinghai Lu Signed-off-by: Pekka Enberg LKML-Reference: <1278835617-11368-1-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: H.
Peter Anvin --- arch/x86/boot/boot.h | 16 +++++++ arch/x86/boot/main.c | 3 ++ arch/x86/boot/string.c | 41 ++++++++++++++++++ arch/x86/boot/tty.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 165 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 98239d2658f..46c4c5c71af 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -37,6 +37,8 @@ extern struct setup_header hdr; extern struct boot_params boot_params; +#define cpu_relax() asm volatile("rep; nop") + /* Basic port I/O */ static inline void outb(u8 v, u16 port) { @@ -203,6 +205,17 @@ static inline int isdigit(int ch) return (ch >= '0') && (ch <= '9'); } +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + /* Heap -- available for dynamic lists. */ extern char _end[]; extern char *HEAP; @@ -329,10 +342,13 @@ void initregs(struct biosregs *regs); /* string.c */ int strcmp(const char *str1, const char *str2); +int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ +void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 140172b895b..4ef1a33e857 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -130,6 +130,9 @@ void main(void) /* First, copy the boot header into the "zeropage" */ copy_boot_params(); + /* Initialize the early-boot console */ + console_init(); + /* End of heap check */ init_heap(); diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index f94b7a0c2ab..aba29df4a7b 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2) return 0; } +int strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + size_t strnlen(const char *s, size_t maxlen) { const char *es = s; @@ -48,3 +64,28 @@ unsigned int atou(const char *s) i = i * 10 + (*s++ - '0'); return i; } + +/* Works only for digits and letters, but small and fast */ +#define TOLOWER(x) ((x) | 0x20) + +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) +{ + unsigned long long result = 0; + + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') + cp += 2; + + while (isxdigit(*cp)) { + unsigned int value; + + value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10; + if (value >= base) + break; + result = result * base + value; + cp++; + } + if (endp) + *endp = (char *)cp; + + return result; +} diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 01ec69c901c..f3ceee20ff1 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -10,23 +10,51 @@ * ----------------------------------------------------------------------- */ /* - * Very simple screen I/O - * XXX: Probably should add very simple serial I/O? 
+ * Very simple screen and serial I/O */ #include "boot.h" +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +static int early_serial_base; + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + /* * These functions are in .inittext so they can be used to signal * error during initialization. */ -void __attribute__((section(".inittext"))) putchar(int ch) +static void __attribute__((section(".inittext"))) serial_putchar(int ch) { - struct biosregs ireg; + unsigned timeout = 0xffff; - if (ch == '\n') - putchar('\r'); /* \n -> \r\n */ + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +static void __attribute__((section(".inittext"))) bios_putchar(int ch) +{ + struct biosregs ireg; initregs(&ireg); ireg.bx = 0x0007; @@ -36,6 +64,17 @@ void __attribute__((section(".inittext"))) putchar(int ch) intcall(0x10, &ireg, NULL); } +void __attribute__((section(".inittext"))) putchar(int ch) +{ + if (ch == '\n') + putchar('\r'); /* \n -> \r\n */ + + bios_putchar(ch); + + if (early_serial_base != 0) + serial_putchar(ch); +} + void __attribute__((section(".inittext"))) puts(const char *str) { while (*str) @@ -112,3 +151,63 @@ int getchar_timeout(void) return 0; /* Timeout! */ } + +static void early_serial_init(int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +void console_init(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + early_serial_base = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int port = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + port = 1; + + early_serial_base = bases[port]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (early_serial_base != 0) + early_serial_init(baud); +} -- cgit v1.2.3 From ce0aa5dd20e44372f9617dd67c984f41fcdbed88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 13 Jul 2010 13:35:17 -0700 Subject: x86, setup: Make the setup code also accept console=uart8250 Make the boot code also accept the console=uart8250,io,0x2f8,115200n form of early console. Also add back simple_guess_base(), otherwise those simple_strtoull(,,0) are not going to work. Signed-off-by: Yinghai Lu LKML-Reference: <4C3CCE05.4090505@kernel.org> Acked-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/string.c | 22 +++++++++++++++++++ arch/x86/boot/tty.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index aba29df4a7b..3cbc4058dd2 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -68,10 +68,32 @@ unsigned int atou(const char *s) /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) +static unsigned int simple_guess_base(const char *cp) +{ + if (cp[0] == '0') { + if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2])) + return 16; + else + return 8; + } else { + return 10; + } +} + +/** + * simple_strtoull - convert a string to an unsigned long long + * @cp: The start of the string + * @endp: A pointer to the end of the parsed string will be placed here + * @base: The number base to use + */ + unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { unsigned long long result = 0; + if (!base) + base = simple_guess_base(cp); + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') cp += 2; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f3ceee20ff1..f6d52e65f97 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -170,7 +170,7 @@ static void early_serial_init(int baud) outb(c & ~DLAB, early_serial_base + LCR); } -void console_init(void) +static int parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; @@ -208,6 +208,63 @@ void console_init(void) baud = DEFAULT_BAUD; } + return baud; +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static int parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return baud; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + early_serial_base = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + early_serial_base = simple_strtoull(options + 8, &options, 0); + else + return baud; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(early_serial_base); + + return baud; +} + +void console_init(void) +{ + int baud; + + baud = parse_earlyprintk(); + + if (!early_serial_base) + baud = parse_console_uart8250(); + if (early_serial_base != 0) early_serial_init(baud); } -- cgit v1.2.3 From df378ccfc4dd04e263426ad805516915874774aa Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:55:11 -0700 Subject: x86, alternatives: Fix one more open-coded 8-bit alternative number Fix a missing case of an 8-bit alternative number, buried inside an assembly macro. Signed-off-by: H. 
Peter Anvin Reported-by: Yinghai Lu Cc: Suresh Siddha LKML-Reference: <4C3BDDA3.2060900@kernel.org> --- arch/x86/lib/copy_user_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 71100c98e33..a460158b5ac 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -29,7 +29,7 @@ .align 8 .quad 0b .quad 2b - .byte \feature /* when feature is set */ + .word \feature /* when feature is set */ .byte 5 .byte 5 .previous -- cgit v1.2.3 From 3b770a2128423a687e6e9c57184a584fb4ba4c77 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:57:50 -0700 Subject: x86, alternatives: BUG on encountering an invalid CPU feature number Make the alternatives-patching code BUG on encountering an invalid CPU feature number. Should have done this a long time ago. Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Suresh Siddha LKML-Reference: --- arch/x86/kernel/alternative.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 70237732a6c..f65ab8b014c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 70b0d22d581a5deef7b2876b0c3774635b8d846c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 14 Jul 2010 11:26:57 -0700 Subject: x86, setup: Only set early_serial_base after port is initialized putchar() uses early_serial_base to check whether the port is initialized, so we only assign it after early_serial_init() is called, in case we need to use VGA to debug the early serial console. Also display the port address and baud rate. -v2: update to current tip Acked-by: Pekka Enberg Signed-off-by: Yinghai Lu LKML-Reference: <4C3E0171.6050008@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/tty.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f6d52e65f97..ff4b27a0fc5 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -152,35 +152,40 @@ int getchar_timeout(void) return 0; /* Timeout!
*/ } -static void early_serial_init(int baud) +static void early_serial_init(int port, int baud) { unsigned char c; unsigned divisor; - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = port; + + printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); } -static int parse_earlyprintk(void) +static void parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; int pos = 0; + int port = 0; if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { char *e; if (!strncmp(arg, "serial", 6)) { - early_serial_base = DEFAULT_SERIAL_PORT; + port = DEFAULT_SERIAL_PORT; pos += 6; } @@ -189,15 +194,15 @@ static int parse_earlyprintk(void) if (!strncmp(arg, "ttyS", 4)) { static const int bases[] = { 0x3f8, 0x2f8 }; - int port = 0; + int idx = 0; if (!strncmp(arg + pos, "ttyS", 4)) pos += 4; if (arg[pos++] == '1') - port = 1; + idx = 1; - early_serial_base = bases[port]; + port = bases[idx]; } if (arg[pos] == ',') @@ -208,7 +213,8 @@ static int parse_earlyprintk(void) baud = DEFAULT_BAUD; } - return baud; + if (port) + early_serial_init(port, baud); } #define BASE_BAUD (1843200/16) @@ -227,44 +233,41 @@ static unsigned int probe_baud(int port) return BASE_BAUD / quot; } -static int parse_console_uart8250(void) +static void parse_console_uart8250(void) { char optstr[64], *options; int baud = DEFAULT_BAUD; + int port = 0; /* * console=uart8250,io,0x3f8,115200n8 * need to make sure it is last one console ! */ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return baud; + return; options = optstr; if (!strncmp(options, "uart8250,io,", 12)) - early_serial_base = simple_strtoull(options + 12, &options, 0); + port = simple_strtoull(options + 12, &options, 0); else if (!strncmp(options, "uart,io,", 8)) - early_serial_base = simple_strtoull(options + 8, &options, 0); + port = simple_strtoull(options + 8, &options, 0); else - return baud; + return; if (options && (options[0] == ',')) baud = simple_strtoull(options + 1, &options, 0); else - baud = probe_baud(early_serial_base); + baud = probe_baud(port); - return baud; + if (port) + early_serial_init(port, baud); } void console_init(void) { - int baud; - - baud = parse_earlyprintk(); + parse_earlyprintk(); if (!early_serial_base) - baud = parse_console_uart8250(); - - if (early_serial_base != 0) - early_serial_init(baud); + parse_console_uart8250(); } -- cgit v1.2.3 From b2691085d1f3ccce641dcfdd02722ba5d34db6ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Jun 2010 16:46:48 -0700 Subject: x86: Clean up arch/x86/kernel/cpu/mtrr/cleanup.c: use ";" not "," to terminate statements Also needed if pr_ becomes a bit more space efficient. Signed-off-by: Joe Perches Acked-by: Thomas Gleixner Cc: "H. 
Peter Anvin" LKML-Reference: <1277768808.29157.280.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f01..c5f59d07142 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", -- cgit v1.2.3 From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 May 2010 20:17:00 +0200 Subject: unistd: add __NR_prlimit64 syscall numbers Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to asm-x86, both 32 and 64-bit. Signed-off-by: Jiri Slaby Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa..a88e31d1836 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_prlimit64 ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a..35e0cb151b6 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_prlimit64 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81..570bf5eae56 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_prlimit64 300 +__SYSCALL(__NR_prlimit64, sys_prlimit64) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b372934121..eca1d7d23ab 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_prlimit64 -- cgit v1.2.3 From 93a7ca0c3ebe5d931126f1fb732cb9c4518383d4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 16 Jul 2010 10:11:21 -0500 Subject: x86, UV: 
Initialize BAU MMRs only on hubs with cpus Remove the initialization of MMRs UVH_LB_BAU_SB_ACTIVATION_CONTROL and UVH_BAU_DATA_BROADCAST on UV hubs that have no active cpus. Such initialization on hubs with no active cpus would result in a kernel page fault. This is not of real high priority, because we don't have any such systems (with UV hubs that have no active cpus). But they will be coming. Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index abf3c31f14c..59efb5390b3 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1635,12 +1635,16 @@ static int __init uv_bau_init(void) alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); + if (uv_blade_nr_possible_cpus(uvhub)) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, + UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, + mmr); + } } return 0; -- cgit v1.2.3 From a2531293dbb7608fa672ff28efe3ab4027917a2f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 18 Jul 2010 14:27:13 +0200 Subject: update email address pavel@suse.cz no longer works, replace it with working address. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/power/cpu.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b9..f51cc55aced 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek + * Copyright (C) 2001-2003 Pavel Machek */ #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182ca3a..4c9c67bf09b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek + * Make kapmd absorb more idle time (Pavel Machek * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * fixed by sfr). diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e..0af9aa20fce 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -9,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek + * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. 
* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d..9a6674689a2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -2,7 +2,7 @@ * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Pavel Machek * Copyright (C) 2002,2003 Andi Kleen */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1290ba54b35..e7e8c5f5495 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index d24f983ba1e..460f314d13e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ -- cgit v1.2.3 From 6c54aabd5e687092557f4881ce2d4013b971f293 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 12:03:51 -0400 Subject: x86/amd-iommu: Use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c..29dd3b9f2f0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2609,8 +2609,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; -- cgit v1.2.3 From edb18f8ab02843453306601c4aa697f9691129cd Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:50 -0700 Subject: x86, cpu: Make init_scattered_cpuid_features() consider cpuid subleaves Some cpuid features (like xsaveopt) are enumerated using cpuid subleaves. Extend init_scattered_cpuid_features() to take subleaf into account. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.439900717@sbs-t61.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 7369b4c2c55..03cf24a3d93 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -14,6 +14,7 @@ struct cpuid_bit { u8 reg; u8 bit; u32 level; + u32 sub_leaf; }; enum cpuid_regs { @@ -30,16 +31,16 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } }; for (cb = cpuid_bits; cb->feature; cb++) { @@ -50,8 +51,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX], - &regs[CR_ECX], &regs[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], + &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); -- cgit v1.2.3 From 40e1d7a4ffee5cb17f5c36f4c3c4a011ab103ebe Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:51 -0700 Subject: x86, cpu: Add xsaveopt cpufeature Add cpu feature bit support for the XSAVEOPT instruction. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.523204988@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3ec9275cea4..d5ea3e3a8a4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,6 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ -- cgit v1.2.3 From 5734f62b6601d88fd8ec720cb56b93fd3a030557 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, cpu: Enumerate xsaveopt Enumerate the xsaveopt feature. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 03cf24a3d93..41eebcd90fc 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -35,6 +35,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, -- cgit v1.2.3 From a1488f8bf4d72ad724700f6e982469a1240e4264 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:48 -0700 Subject: x86, xsave: Track the offset, size of state in the xsave layout Subleaves of cpuid leaf 0xd provide the offset and size of the different feature states that are managed by xsave/xrstor. Track this for upcoming use during signal handling. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.262987929@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 980149867a1..4993caa4181 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -21,6 +21,8 @@ struct _fpx_sw_bytes fx_sw_reserved; struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -301,6 +303,31 @@ void __cpuinit xsave_init(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + /* * setup the xstate image representing the init state */ @@ -308,6 +335,8 @@ static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + setup_xstate_features(); } /* -- cgit v1.2.3 From 29104e101d710dd152f807978884643a52eca8b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:49 -0700 Subject: x86, xsave: Sync xsave memory layout with its header for user handling With xsaveopt, if a processor implementation discerns that a processor state component is in its initialized state, it may clear the corresponding bit in xsave_hdr.xstate_bv to '0' without modifying the corresponding memory layout. Hence, while presenting the xstate information to the user, we always ensure that the memory layout of a feature will be in the init state if the corresponding header bit is zero (a simplified sketch of this rule follows).
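[Editor's note: a simplified, compilable sketch of that rule, with hypothetical helpers standing in for the xstate_offsets[]/xstate_sizes[] table and init image recorded by the previous patch; the authoritative version is the __sanitize_i387_state() hunk in the xsave.c diff below.]

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-ins for the recorded per-feature layout data. */
    extern unsigned int xstate_offset(int feature);
    extern unsigned int xstate_size(int feature);
    extern const unsigned char *init_image;

    /* For every supported feature whose header bit is clear, the saved
     * area may be stale, so overwrite it with the feature's init state
     * before showing the buffer to user space. */
    void sanitize_xstate(unsigned char *buf, uint64_t xstate_bv,
                         uint64_t pcntxt_mask)
    {
            int i;

            for (i = 2; i < 64; i++) { /* bits 0/1: FP/SSE legacy areas */
                    uint64_t bit = 1ULL << i;

                    if ((pcntxt_mask & bit) && !(xstate_bv & bit))
                            memcpy(buf + xstate_offset(i),
                                   init_image + xstate_offset(i),
                                   xstate_size(i));
            }
    }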
This ensures consistency and avoids the user seeing stale state in the memory layout during signal handling, debugging, etc. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.351459480@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 14 +++++++ arch/x86/include/asm/xsave.h | 10 +++++ arch/x86/kernel/i387.c | 11 ++++++ arch/x86/kernel/xsave.c | 89 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 123 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..bb370fd0a1c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -58,11 +58,25 @@ extern int restore_i387_xstate_ia32(void __user *buf); #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_xsaveopt(void) +{ + return 0; +} + static __always_inline __pure bool use_xsave(void) { return static_cpu_has(X86_FEATURE_XSAVE); } +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + #ifdef CONFIG_X86_64 /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..0c72adc0cb1 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -111,6 +111,16 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask) : "memory"); } +static inline void xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b3225..6106af9fd12 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -190,6 +190,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); } @@ -207,6 +209,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); @@ -446,6 +450,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -467,6 +473,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -533,6 +541,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4993caa4181..368047c8d50 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -23,6 +23,76 @@ struct _fpx_sw_bytes fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -112,6 +182,7 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { + sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; @@ -333,10 +404,26 @@ static void setup_xstate_features(void) */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; - setup_xstate_features(); + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* -- cgit v1.2.3 From 6bad06b768920e278c7cedfdda56a0b4c6a35ee9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, xsave: Use xsaveopt in context-switch path when supported xsaveopt is a more optimized form of xsave specifically designed for the context switch usage. xsaveopt doesn't save the state that's not modified from the prior xrstor. 
And if a specific feature state gets modified to the init state, then xsaveopt just updates the header bit in the xsave memory layout without updating the corresponding memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 2 +- arch/x86/include/asm/xsave.h | 9 ++++++--- arch/x86/kernel/cpu/common.c | 8 ++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index bb370fd0a1c..59bd93ac7fe 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -60,7 +60,7 @@ extern int restore_i387_xstate_ia32(void __user *buf); static __always_inline __pure bool use_xsaveopt(void) { - return 0; + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 0c72adc0cb1..ec86c5fd6a6 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -125,8 +125,11 @@ static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ - __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" - : : "D" (&(fpu->state->xsave)), - "a" (-1), "d"(-1) : "memory"); + alternative_input( + ".byte " REX_PREFIX "0x0f,0xae,0x27", + ".byte " REX_PREFIX "0x0f,0xae,0x37", + X86_FEATURE_XSAVEOPT, + [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : + "memory"); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7358303d8c..3f715efc594 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -- cgit v1.2.3 From 278bc5f6abd69dd868746dbd642266ac09a9c9c6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:53:51 -0700 Subject: x86, cpu: Clean up formatting in cpufeature.h, remove override Clean up the formatting in cpufeature.h, and remove an unnecessary name override. Signed-off-by: H. 
Peter Anvin Cc: Suresh Siddha LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d5ea3e3a8a4..4be50ddd4d7 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,7 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ +#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ @@ -173,13 +173,13 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From 2decb194e65ab66eaf787512dc572cdc99893b24 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:32:04 -0700 Subject: x86, cpu: Split addon_cpuid_features.c addon_cpuid_features.c contains exactly two almost completely unrelated functions, plus has a long and very generic name. Split it into two files, scattered.c for the scattered feature flags, and topology.c for the topology information. Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 150 ----------------------------- arch/x86/kernel/cpu/scattered.c | 61 ++++++++++++ arch/x86/kernel/cpu/topology.c | 99 +++++++++++++++++++ 4 files changed, 161 insertions(+), 151 deletions(-) delete mode 100644 arch/x86/kernel/cpu/addon_cpuid_features.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/topology.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6..5e3a3512ba0 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,7 +12,7 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c deleted file mode 100644 index 41eebcd90fc..00000000000 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. - */ -#include - -#include -#include - -#include - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; - u32 sub_leaf; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { 0, 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], - &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 - -/* leaf 0xb sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 - -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) - -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; - - if (c->cpuid_level < 0xb) - return; - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - - /* - * check if the cpuid leaf 0xb is actually implemented.
- */ - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; - - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); - - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->initial_apicid = edx; - - /* - * Populate HT related information from sub-leaf level 0. - */ - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); - - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } - return; -#endif -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 00000000000..9815364b477 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,61 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ +#include + +#include +#include + +#include + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], + &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 00000000000..4397e987a1c --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,99 @@ +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection.
+ */ + +#include +#include +#include +#include + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + static bool printed; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } + return; +#endif +} -- cgit v1.2.3 From 093d7b4639951ea3021a6f70d376c3ff31f4740c Mon Sep 17 00:00:00 2001 From: Miroslav Rezanina Date: Wed, 16 Sep 2009 03:56:17 -0400 Subject: xen: release unused free memory Scan an e820 table and release any memory which lies between e820 entries, as it won't be used and would just be wasted. At present this is just to release any memory beyond the end of the e820 map, but it will also deal with holes being punched in the map. 
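[Editor's note: a worked illustration of the gap walk described above, using made-up e820 entries; the real loop is xen_return_unused_memory() in the diff below. The walker remembers where the previous entry ended and releases everything from there to the start of the next entry.]

    #include <stdio.h>
    #include <stdint.h>

    struct e820_entry { uint64_t addr, size; };

    /* Hypothetical two-entry map: low memory, then RAM above 1 MB. */
    static const struct e820_entry map[] = {
            { 0x00000000ULL, 0x0009f000ULL },   /* 0 - 636 KB */
            { 0x00100000ULL, 0x07f00000ULL },   /* 1 MB - ~128 MB */
    };

    int main(void)
    {
            uint64_t last_end = 0;
            unsigned int i;

            for (i = 0; i < 2; i++) {
                    if (map[i].addr > last_end)  /* a hole: give it back */
                            printf("release [%#llx, %#llx)\n",
                                   (unsigned long long)last_end,
                                   (unsigned long long)map[i].addr);
                    last_end = map[i].addr + map[i].size;
            }
            /* prints: release [0x9f000, 0x100000) -- the VGA/BIOS hole */
            return 0;
    }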
Derived from patch by Miroslav Rezanina Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 - arch/x86/xen/setup.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'arch/x86')
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..399bed2de88 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -731,7 +731,6 @@ static void set_xen_basic_apic_ops(void) #endif - static void xen_clts(void) { struct multicall_space mcs;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd..e0942630d47 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "xen-ops.h" @@ -32,6 +33,56 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long start, end; + unsigned long len; + unsigned long pfn; + int ret; + + start = PFN_UP(start_addr); + end = PFN_UP(end_addr); + + if (end <= start) + return 0; + + len = end - start; + + set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); + reservation.nr_extents = len; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + + for(pfn = start; pfn < end; pfn++) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + + return len; +} + +static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +{ + unsigned long last_end = 0; + unsigned long released = 0; + int i; + + for (i = 0; i < e820->nr_map; i++) { + released += xen_release_chunk(last_end, e820->map[i].addr); + last_end = e820->map[i].addr + e820->map[i].size; + } + + released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + + printk(KERN_INFO "released %ld pages of unused memory\n", released); + return released; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. @@ -67,6 +118,8 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + xen_return_unused_memory(&e820); + return "Xen"; } -- cgit v1.2.3
From f89e048e76da7ac0b4c89e75606ca7a3422886b1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 16 Sep 2009 12:38:33 -0700 Subject: xen: make sure pages are really part of domain before freeing Scan the set of pages we're freeing and make sure they're actually owned by the domain before freeing. This generally won't happen on a domU (since Xen gives us contiguous memory), but it could happen if there are some hardware mappings passed through. We only bother going up to the highest page Xen actually claimed to give us, since there's definitely nothing of ours above that.
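[Editor's note: the ownership test this patch adds reduces to a round trip through the phys-to-machine map -- a pfn is only ours if pfn -> mfn -> pfn comes back unchanged. Below is a toy, self-contained illustration of that invariant; the table and helpers are stand-ins, not the kernel's p2m/m2p code.]

#include <stdio.h>

#define NPAGES			4
#define INVALID_P2M_ENTRY	(~0UL)

/* Toy p2m table: pfn 2 has no machine page behind it. */
static unsigned long p2m[NPAGES] = { 7, 3, INVALID_P2M_ENTRY, 9 };

static unsigned long mfn_to_pfn(unsigned long mfn)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NPAGES; pfn++)
		if (p2m[pfn] == mfn)
			return pfn;
	return INVALID_P2M_ENTRY;
}

/* A pfn belongs to this domain only if both views agree; a
 * passed-through hardware mapping fails the round trip. */
static int page_is_ours(unsigned long pfn)
{
	unsigned long mfn = p2m[pfn];

	return mfn != INVALID_P2M_ENTRY && mfn_to_pfn(mfn) == pfn;
}

int main(void)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NPAGES; pfn++)
		printf("pfn %lu: %s\n", pfn, page_is_ours(pfn) ? "ours" : "skip");
	return 0;
}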
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 59 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0942630d47..9deb6bab6c7 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,52 +33,69 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); -static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, + phys_addr_t end_addr) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long start, end; - unsigned long len; + unsigned long len = 0; unsigned long pfn; int ret; start = PFN_UP(start_addr); - end = PFN_UP(end_addr); + end = PFN_DOWN(end_addr); if (end <= start) return 0; - len = end - start; - - set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); - reservation.nr_extents = len; - - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", - start, end, ret); - - for(pfn = start; pfn < end; pfn++) - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", + start, end); + for(pfn = start; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + if (ret == 1) { + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + len++; + } + } + printk(KERN_CONT "%ld pages freed\n", len); return len; } -static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, + const struct e820map *e820) { - unsigned long last_end = 0; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + phys_addr_t last_end = 0; unsigned long released = 0; int i; - for (i = 0; i < e820->nr_map; i++) { - released += xen_release_chunk(last_end, e820->map[i].addr); + for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { + phys_addr_t end = e820->map[i].addr; + end = min(max_addr, end); + + released += xen_release_chunk(last_end, end); last_end = e820->map[i].addr + e820->map[i].size; } - released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + if (last_end < max_addr) + released += xen_release_chunk(last_end, max_addr); printk(KERN_INFO "released %ld pages of unused memory\n", released); return released; @@ -118,7 +135,7 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(&e820); + xen_return_unused_memory(xen_start_info->nr_pages, &e820); return "Xen"; } -- cgit v1.2.3
From 5f755293ca61520b70b11afe1b1d6e1635cb6c00 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:48 -0700 Subject: x86, gcc-4.6: Avoid unused but set variables in rdmsr Avoids quite a lot of warnings with a gcc 4.6 -Wall build because this happens in a commonly used header file
(apic.h) Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJme6021066@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c5bc4c2d33f..084ef95274c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter) #define rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ - (val1) = (u32)__val; \ - (val2) = (u32)(__val >> 32); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(unsigned msr, unsigned low, unsigned high) -- cgit v1.2.3 From fa10ba64ac94fec4611b79804023eb087862ffe0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:49 -0700 Subject: x86, gcc-4.6: Fix set but not read variables Just some dead code, no real bugs. Found by gcc 4.6 -Wall Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJnQ0021072@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf23..a2e0caf26e1 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61..7d28d7d0388 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); -- cgit v1.2.3 From 7aa2b5f8ec60505160df1c25398e8286c8432689 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:48 +0200 Subject: x86, xsave: Do not include asm/i387.h in asm/xsave.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no dependencies to asm/i387.h. Instead, if including only xsave.h the following error occurs: .../arch/x86/include/asm/i387.h:110: error: ‘XSTATE_FP’ undeclared (first use in this function) .../arch/x86/include/asm/i387.h:110: error: (Each undeclared identifier is reported only once .../arch/x86/include/asm/i387.h:110: error: for each function it appears in.) This patch fixes this. 
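[Editor's note: the failure mode fixed here is a header that includes another header before defining the macros that header relies on. A minimal reproduction with two hypothetical headers -- not the real kernel files -- shows the ordering problem; the snippet is expected NOT to compile, which is the point.]

/* a.h -- plays the role of the old xsave.h */
#include "b.h"		/* b.h is parsed here, before FLAG_FP exists */
#define FLAG_FP 0x1

/* b.h -- plays the role of i387.h */
static inline int uses_fp(int mask)
{
	return mask & FLAG_FP;	/* error: 'FLAG_FP' undeclared -- the same
				 * shape as the XSTATE_FP report above */
}

/* The fix mirrors the patch: drop the include from a.h, and let code
 * that needs both headers include them explicitly, in a working order. */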
Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index ec86c5fd6a6..94d5f84d89f 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -3,7 +3,6 @@ #include #include -#include #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 -- cgit v1.2.3 From db10db48b2c530def21bfd76d576702c7df7f620 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:49 +0200 Subject: x86, xsave: 32/64 bit boot cpu check unification in initialization Boot cpu id is always 0, thus simplifying and unifying boot cpu check. boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f715efc594..26804b2986b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1273,7 +1273,7 @@ void __cpuinit cpu_init(void) /* * Boot processor to setup the FP and extended state context info. */ - if (smp_processor_id() == boot_cpu_id) + if (!smp_processor_id()) init_thread_xstate(); xsave_init(); -- cgit v1.2.3 From 82d4150cec83b9775f84810b39a1c0b91585d429 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:51 +0200 Subject: x86, xsave: Move boot cpu initialization to xsave_init() This patch moves boot cpu initialization to xsave_init(). Now all cpus are initialized in one single function. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 6 ------ arch/x86/kernel/i387.c | 5 ----- arch/x86/kernel/xsave.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26804b2986b..40561085d4f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1270,12 +1270,6 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 6106af9fd12..2f32ef05f10 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -93,11 +93,6 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. 
- */ - if (!smp_processor_id()) - init_thread_xstate(); xsave_init(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 368047c8d50..ab9ad48b653 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,7 +360,7 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static void __cpuinit __xsave_init(void) { if (!cpu_has_xsave) return; @@ -446,7 +446,7 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + __xsave_init(); /* * Recompute the context size for enabled features @@ -463,3 +463,13 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +void __cpuinit xsave_init(void) +{ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + __xsave_init(); +} -- cgit v1.2.3 From 92851e2fca48f1893f899963c13b55b61ac6956c Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 20 Jul 2010 15:19:46 -0700 Subject: x86, mm: Create symbolic index into address_markers array Without this, adding entries into the address_markers array means adding more and more of an #ifdef maze in pt_dump_init(). By using indices, we can keep it a bit saner. Signed-off-by: Andres Salomon LKML-Reference: <201007202219.o6KMJkUs021052@imap1.linux-foundation.org> Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/dump_pagetables.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a725b7f760a..0002a3a3308 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -37,6 +37,28 @@ struct addr_marker { const char *name; }; +/* indices for address_markers; keep sync'd w/ address_markers below */ +enum address_markers_idx { + USER_SPACE_NR = 0, +#ifdef CONFIG_X86_64 + KERNEL_SPACE_NR, + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, +#else + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +# ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +# endif + FIXADDR_START_NR, +#endif +}; + /* Address space markers hints */ static struct addr_marker address_markers[] = { { 0, "User Space" }, @@ -331,14 +353,12 @@ static int pt_dump_init(void) #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ - address_markers[2].start_address = VMALLOC_START; - address_markers[3].start_address = VMALLOC_END; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM - address_markers[4].start_address = PKMAP_BASE; - address_markers[5].start_address = FIXADDR_START; -# else - address_markers[4].start_address = FIXADDR_START; + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, -- cgit v1.2.3 From 468c30f2bbdf1ba0fbf16667eade23a46eaa8f06 Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Tue, 20 Jul 2010 15:19:47 -0700 Subject: x86, iomap: Fix wrong page aligned size calculation in ioremapping code x86 early_iounmap(): fix off-by-one error in page alignment of allocation size for 
sizes where size%PAGE_SIZE==1. Signed-off-by: Florian Zumbiehl LKML-Reference: <201007202219.o6KMJlES021058@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86')
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d41d3a9036c..3ba6e0608c5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -611,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) return; } offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { -- cgit v1.2.3
From a751bd858b16dce57f3b6b85ba07946df1bd7be4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Stay on fast path when count > 0 in __up_write() When count > 0 there is no need to take the call_rwsem_wake path. If we did take that path, it would just return without doing anything due to the active count not being zero. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJj9x021042@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 606ede12697..5bf5e04e497 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -216,9 +216,8 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* tries to transition - 0xffff0001 -> 0x00000000 */ - " jz 1f\n" + /* subtracts 0xffff0001, returns the old value */ + " jns 1f\n\t" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" -- cgit v1.2.3
From b4bcb4c28c64cc2876b4aef218d992ce806194da Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Minor cleanups Clarified a few comments and made initialization of %edx/%rdx more uniform across __down_write_nested, __up_read and __up_write functions. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJkiA021048@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/rwsem.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 5bf5e04e497..d1e41b0f9b6 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" LOCK_PREFIX _ASM_INC "(%1)\n\t" - /* adds 0x00000001, returns the old value */ + /* adds 0x00000001 */ " jns 1f\n" " call call_rwsem_down_read_failed\n" "1:\n\t" @@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { rwsem_count_t tmp; - - tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* subtract 0x0000ffff, returns the old value */ + /* adds 0xffff0001, returns the old value */ " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" @@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) "1:\n" "# ending down_write" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); } @@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n" "# ending __up_read\n" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); } @@ -218,7 +216,7 @@ static inline void __up_write(struct rw_semaphore *sem) LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n\t" "# ending __up_write\n" : "+m" (sem->count), "=d" (tmp) -- cgit v1.2.3
From 5edd19af18a36a4e22c570b1b969179e0ca1fe4c Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 20 Jul 2010 18:09:05 -0500 Subject: x86, UV: Make kdump avoid stack dumps UV NMI callbacks should not write stack dumps when a kdump is to be written. When invoking the crash kernel to write a dump, kdump_nmi_shootdown_cpus() uses NMIs to get all the CPUs to save their register context and halt. But the NMI interrupt handler runs a callback list. This patch sets a flag to prevent any of those callbacks from interfering with the halt of the cpu. For UV, which currently has the only callback to which this is relevant, the uv_handle_nmi() callback should not dump stacks. The 'in_crash_kexec' flag is defined as an extern in kdebug.h firstly because x2apic_uv_x.c includes it. Secondly because some future callback might need the flag to know that it should not enter the debugger. (Such a scenario was in fact present in the 2.6.32 kernel, SuSE distribution, where a call to kdb needed to be avoided.) Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/kdebug.h | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++++ arch/x86/kernel/crash.c | 3 +++ 3 files changed, 8 insertions(+) (limited to 'arch/x86')
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index fa7c0b97476..7a2910b3512 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -33,5 +33,6 @@ extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +extern int in_crash_kexec; #endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e46f98f36e3..7b598b84c90 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { if (reason != DIE_NMI_IPI) return NOTIFY_OK; + + if (in_crash_kexec) + /* do nothing if entering the crash kernel */ + return NOTIFY_OK; /* * Use a lock so only one cpu prints at a time * to prevent intermixed output.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ebd4c51d096..764c7c2b181 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,6 +28,8 @@ #include #include +int in_crash_kexec; + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct die_args *args) @@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) static void kdump_nmi_shootdown_cpus(void) { + in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); -- cgit v1.2.3
From 3f8afb77cd8a672f024e4a16763ef177bc16c8f8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 21 Jul 2010 14:47:05 +0200 Subject: x86, tlb: Clean up and correct used type smp_processor_id() returns an int and not an unsigned long. Also, since the function is small enough, there's no need for a local variable caching its value. No functionality change, just cleanup. Signed-off-by: Borislav Petkov LKML-Reference: <20100721124705.GA674@aftab> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 426f3a1a64d..c03f14ab666 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) static void do_flush_tlb_all(void *info) { - unsigned long cpu = smp_processor_id(); - __flush_tlb_all(); if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(cpu); + leave_mm(smp_processor_id()); } void flush_tlb_all(void) -- cgit v1.2.3
From 0e49bf66d2ca649b167428adddbbbe9d9bd4894c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:52 +0200 Subject: x86, xsave: Separate fpu and xsave initialization As xsave also supports features other than the fpu, it should be initialized independently of the fpu. This patch moves this out of fpu initialization. There is also a lot of cross referencing between fpu and xsave code. This patch reduces this by making xsave_cntxt_init() and init_thread_xstate() static functions. The patch moves the cpu_has_xsave check to the beginning of xsave_init(). All other checks can then be removed.
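[Editor's note: the per-CPU bring-up order that results from this split is easier to follow as a sketch. The bodies below are stand-ins, not the kernel implementation; the point is that every CPU runs both init paths while the one-time sizing work is keyed off the boot CPU.]

#include <stdio.h>

static int xstate_size;				/* sized once, on the boot CPU */

static void init_thread_xstate(void) { xstate_size = 512; }	/* FXSAVE-era size */
static void xsave_cntxt_init(void)   { xstate_size = 832; }	/* CPUID 0xd size */
static void enable_xsave_hw(void)    { /* set CR4.OSXSAVE, program XCR0 */ }

static void bring_up_cpu(int cpu, int cpu_has_xsave)
{
	if (cpu == 0)
		init_thread_xstate();	/* fpu_init(): boot CPU only */

	if (!cpu_has_xsave)		/* xsave_init(): the single check left */
		return;
	if (cpu == 0)
		xsave_cntxt_init();	/* may overwrite xstate_size */
	enable_xsave_hw();		/* runs on every CPU */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		bring_up_cpu(cpu, 1);
	printf("xstate_size = %d\n", xstate_size);
	return 0;
}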
Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/i387.c | 27 +++++++++++++++++++-------- arch/x86/kernel/xsave.c | 10 +++++----- 5 files changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 59bd93ac7fe..509ddabeae2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); extern void __math_state_restore(void); -extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 94d5f84d89f..4d3b5d1fc02 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -28,7 +28,6 @@ extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void xsave_cntxt_init(void); extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 40561085d4f..94c36c7ac18 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1210,6 +1210,7 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1270,6 +1271,7 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2f32ef05f10..e73c54ebafc 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. 
*/ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,14 +94,24 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - xsave_init(); + if (!smp_processor_id()) + init_thread_xstate(); mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ + +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) +{ + if (!smp_processor_id()) + init_thread_xstate(); +} + +#endif /* CONFIG_X86_32 */ static void fpu_finit(struct fpu *fpu) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ab9ad48b653..550bf45236f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -362,9 +362,6 @@ */ static void __cpuinit __xsave_init(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); /* @@ -429,7 +426,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __cpuinit xsave_cntxt_init(void) { unsigned int eax, ebx, ecx, edx; @@ -466,10 +463,13 @@ void __ref xsave_cntxt_init(void) void __cpuinit xsave_init(void) { + if (!cpu_has_xsave) + return; + /* * Boot processor to setup the FP and extended state context info. */ if (!smp_processor_id()) - init_thread_xstate(); + xsave_cntxt_init(); __xsave_init(); } -- cgit v1.2.3
From 97e80a70db689fb1e876df9f12305cc72f85ca53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:53 +0200 Subject: x86, xsave: Introduce xstate enable functions The patch renames xsave_cntxt_init() and __xsave_init() into xstate_enable_boot_cpu() and xstate_enable() as these names are more meaningful. It also removes the duplicate xcr setup for the boot cpu. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 550bf45236f..2322f586c05 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,15 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static void __cpuinit __xsave_init(void) +static inline void xstate_enable(u64 mask) { set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); } /* @@ -426,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -static void __cpuinit xsave_cntxt_init(void) +static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -443,7 +438,8 @@ static void __cpuinit xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - __xsave_init(); + + xstate_enable(pcntxt_mask); /* * Recompute the context size for enabled features @@ -470,6 +466,7 @@ void __cpuinit xsave_init(void) * Boot processor to setup the FP and extended state context info.
*/ if (!smp_processor_id()) - xsave_cntxt_init(); - __xsave_init(); + xstate_enable_boot_cpu(); + else + xstate_enable(pcntxt_mask); } -- cgit v1.2.3 From ee813d53a8e980a3a28318efb8935d45723f5211 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:54 +0200 Subject: x86, xsave: Check cpuid level for XSTATE_CPUID (0x0d) The patch introduces the XSTATE_CPUID macro and adds a check that tests if XSTATE_CPUID exists. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-4-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 2 ++ arch/x86/kernel/xsave.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 4d3b5d1fc02..d1b5f3a2fa2 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -4,6 +4,8 @@ #include #include +#define XSTATE_CPUID 0x0000000d + #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2322f586c05..5adb7fb408f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -379,7 +379,7 @@ static void setup_xstate_features(void) xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); do { - cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); if (eax == 0) break; @@ -425,7 +425,12 @@ static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -444,7 +449,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); -- cgit v1.2.3 From 45c2d7f46211a0b1f6b425c59575c53145afc4b4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:55 +0200 Subject: x86, xsave: Make init_xstate_buf static The pointer is only used in xsave.c. Making it static. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/xsave.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index d1b5f3a2fa2..0ae6b996198 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,7 +27,6 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; -extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_init(void); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 5adb7fb408f..3b44a9b1eca 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,6 +16,11 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. 
+ */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; @@ -348,11 +353,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif -- cgit v1.2.3 From 4995b9dba908436c1611454f9bd2cb3ddf6babee Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:56 +0200 Subject: x86, xsave: Add __init attribute to setup_xstate_features() This is called only from initialization code. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-6-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3b44a9b1eca..cfc7901ee94 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -370,7 +370,7 @@ static inline void xstate_enable(u64 mask) * Record the offsets and sizes of different state managed by the xsave * memory layout. */ -static void setup_xstate_features(void) +static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; -- cgit v1.2.3 From 1cff92d8fdb27684308864d9cdb324bee43b40ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 Jul 2010 14:23:10 -0700 Subject: x86, xsave: Make xstate_enable_boot_cpu() __init, protect on CPU 0 xstate_enable_boot_cpu() is, as the name implies, only used on the boot CPU; furthermore, it invokes alloc_bootmem(), which is __init; hence it needs to be tagged __init rather than __cpuinit. Furthermore, it is *not* safe in the long run to rely on CPU 0 only coming online during the early boot -- at some point we're going to support offlining (and re-onlining) the boot CPU, and at that point we must not call xstate_enable_boot_cpu() again. The code is a fair bit more obscure than one would like, because the __ref overrides aren't quite powerful enough. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha Cc: Robert Richter LKML-Reference: <4C476236.1020302@zytor.com> --- arch/x86/kernel/xsave.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cfc7901ee94..b2549c3eb2c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,10 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static inline void xstate_enable(u64 mask) +static inline void xstate_enable(void) { set_in_cr4(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* @@ -421,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. 
*/ -static void __cpuinit xstate_enable_boot_cpu(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -444,7 +444,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xstate_enable(pcntxt_mask); + xstate_enable(); /* * Recompute the context size for enabled features @@ -462,16 +462,22 @@ static void __cpuinit xstate_enable_boot_cpu(void) pcntxt_mask, xstate_size); } +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ void __cpuinit xsave_init(void) { + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + if (!cpu_has_xsave) return; - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - xstate_enable_boot_cpu(); - else - xstate_enable(pcntxt_mask); + this_func = next_func; + next_func = xstate_enable; + this_func(); } -- cgit v1.2.3 From 8c06585d6431addadd94903843dfbcd315b42d4e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:26 -0400 Subject: x86: Remove redundant K6 MSRs MSR_K6_EFER is unused, and MSR_K6_STAR is redundant with MSR_STAR. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-1-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 2 -- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae431862..6068e0e06e0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -159,8 +159,6 @@ #define MSR_K7_FID_VID_STATUS 0xc0010042 /* K6 MSRs */ -#define MSR_K6_EFER 0xc0000080 -#define MSR_K6_STAR 0xc0000081 #define MSR_K6_WHCR 0xc0000082 #define MSR_K6_UWCCR 0xc0000085 #define MSR_K6_EPMR 0xc0000086 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..24a22069629 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -130,7 +130,7 @@ static struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { - { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 { .index = MSR_GS_BASE, .always = true }, @@ -2431,7 +2431,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = tsc_offset + native_read_tsc(); break; } - case MSR_K6_STAR: + case MSR_STAR: *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 @@ -2555,7 +2555,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; } - case MSR_K6_STAR: + case MSR_STAR: svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe7..b42ad25d564 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -231,14 +231,14 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. 
*/ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -1057,10 +1057,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0 && vmx->rdtscp_enabled) move_msr_up(vmx, index, save_nmsrs++); /* - * MSR_K6_STAR is only needed on long mode guests, and only + * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_K6_STAR); + index = __find_msr_index(vmx, MSR_STAR); if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d571f6f19..6127468ebbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -671,7 +671,7 @@ static u32 msrs_to_save[] = { HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, + MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif -- cgit v1.2.3 From cfaa71ee9794472598d3966c3315cd6bd8f953d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:27 -0400 Subject: x86: Use symbolic MSR names Use symbolic MSR names instead of hardcoding the MSR index. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-2-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/realmode/wakeup.S | 2 +- arch/x86/kernel/verify_cpu_64.S | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e29601..28595d6df47 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a..56a8c2a867d 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include +#include verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr -- cgit v1.2.3 From 650fb4393dff543bc980d361555c489fbdeed088 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:28 -0400 Subject: x86-64: Simplify loading initial_gs Load initial_gs as two 32-bit values instead of splitting a 64-bit value. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-3-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a..239046bd447 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. 
*/ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. -- cgit v1.2.3
From 4c21adf26f8fcf86a755b9b9f55c2e9fd241e1fb Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Jul 2010 16:59:34 -0700 Subject: x86 cpufreq, perf: Make trace_power_frequency cpufreq driver independent and fix the broken case where a core's frequency depends on others. trace_power_frequency was only implemented in a rather ungeneric way in the acpi-cpufreq driver's target() function. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequency depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and passing the cpu which gets switched automatically fixes this. Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Renninger Cc: davej@codemonkey.org.uk Signed-off-by: Ingo Molnar Cc: Dave Jones Acked-by: Arjan van de Ven Cc: Robert Schoene Tested-by: Robert Schoene Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40e..cee5263927c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..787572d43d9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -478,7 +478,7 @@
static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.3 From 18f19aa62a267f2f759e278018f1032adf4c3774 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 14 May 2010 12:38:24 +0100 Subject: xen: Add support for HVM hypercalls. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Sheng Yang Signed-off-by: Stefano Stabellini --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9c371e4a9fa..7fda040a76c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) return _hypercall2(int, nmi_op, op, arg); } +static inline unsigned long __must_check +HYPERVISOR_hvm_op(int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { -- cgit v1.2.3 From bee6ab53e652a414af20392899879b58cd80d033 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:39:33 +0100 Subject: x86: early PV on HVM features initialization. Initialize basic pv on hvm features adding a new Xen HVM specific hypervisor_x86 structure. Don't try to initialize xen-kbdfront and xen-fbfront when running on HVM because the backends are not available. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/hypervisor.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 1 + arch/x86/xen/enlighten.c | 100 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 70abda7058c..ff2546ce717 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc56a8..bffd47c10fe 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,6 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, + &x86_hyper_xen_hvm, }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..09b36e9d507 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,9 @@ #include #include #include +#include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -76,6 +79,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. 
@@ -1206,3 +1211,98 @@ asmlinkage void __init xen_start_kernel(void) x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } + +static uint32_t xen_cpuid_base(void) +{ + uint32_t base, eax, ebx, ecx, edx; + char signature[13]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &ebx, &ecx, &edx); + *(uint32_t *)(signature + 0) = ebx; + *(uint32_t *)(signature + 4) = ecx; + *(uint32_t *)(signature + 8) = edx; + signature[12] = 0; + + if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) + return base; + } + + return 0; +} + +static int init_hvm_pv_info(int *major, int *minor) +{ + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + *major = eax >> 16; + *minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info = xen_info; + pv_info.kernel_rpl = 0; + + xen_domain_type = XEN_HVM_DOMAIN; + + return 0; +} + +static void __init init_shared_info(void) +{ + struct xen_add_to_physmap xatp; + struct shared_info *shared_info_page; + + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +} + +static void __init xen_hvm_guest_init(void) +{ + int r; + int major, minor; + + r = init_hvm_pv_info(&major, &minor); + if (r < 0) + return; + + init_shared_info(); +} + +static bool __init xen_hvm_platform(void) +{ + if (xen_pv_domain()) + return false; + + if (!xen_cpuid_base()) + return false; + + return true; +} + +const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_hvm_platform, + .init_platform = xen_hvm_guest_init, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); -- cgit v1.2.3
From 38e20b07efd541a959de367dc90a17f92ce2e8a6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:40:51 +0100 Subject: x86/xen: event channels delivery on HVM. Set the callback to receive evtchns from Xen, using the callback vector delivery mechanism. The traditional way for receiving event channel notifications from Xen is via the interrupts from the platform PCI device. The callback vector is a newer alternative that allows us to receive notifications on any vcpu and doesn't need any PCI support: we allocate a vector exclusively to receive events; in the vector handler we don't need to interact with the vlapic, therefore we avoid a VMEXIT.
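[Editor's note: the delivery choice described above comes down to a single branch at init time. A hedged sketch follows; the enum and function are invented for illustration and are not Xen's actual interface.]

#include <stdio.h>

enum evtchn_delivery { DELIVER_PCI_INTX, DELIVER_VECTOR };

/* Prefer the dedicated vector when the hypervisor advertises it: the
 * handler runs without touching the emulated local APIC, so taking an
 * event does not force the extra VMEXIT that APIC emulation would. */
static enum evtchn_delivery pick_delivery(int have_callback_vector)
{
	return have_callback_vector ? DELIVER_VECTOR : DELIVER_PCI_INTX;
}

int main(void)
{
	printf("%s\n", pick_delivery(1) == DELIVER_VECTOR ?
	       "callback vector" : "platform PCI irq");
	return 0;
}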
Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/irq_vectors.h | 3 +++ arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/xen/enlighten.c | 28 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 5 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8767d99c4f6..e2ca3009255 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -125,6 +125,9 @@ */ #define MCE_SELF_VECTOR 0xeb +/* Xen vector callback to receive events in a HVM domain */ +#define XEN_HVM_EVTCHN_CALLBACK 0xe9 + #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..6b196834a0d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1166,6 +1166,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff13983..490ae2bb18a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 09b36e9d507..b211a04c4b2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -80,6 +82,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. 
We map the real shared_info @@ -1277,6 +1281,24 @@ static void __init init_shared_info(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; } +static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + static void __init xen_hvm_guest_init(void) { int r; @@ -1287,6 +1309,12 @@ static void __init xen_hvm_guest_init(void) return; init_shared_info(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + register_cpu_notifier(&xen_hvm_cpu_notifier); + have_vcpu_info_placement = 0; + x86_init.irqs.intr_init = xen_init_IRQ; } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bc..0d0e0e6a747 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -38,6 +38,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); + void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); -- cgit v1.2.3 From 016b6f5fe8398b0291cece60b749d7c930a2e09c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:45:07 +0100 Subject: xen: Add suspend/resume support for PV on HVM guests. Suspend/resume requires few different things on HVM: the suspend hypercall is different; we don't need to save/restore memory related settings; except the shared info page and the callback mechanism. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 24 ++++++++++++++++++------ arch/x86/xen/suspend.c | 6 ++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b211a04c4b2..127c95c8d15 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1262,13 +1262,15 @@ static int init_hvm_pv_info(int *major, int *minor) return 0; } -static void __init init_shared_info(void) +void xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; - struct shared_info *shared_info_page; + static struct shared_info *shared_info_page = 0; - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; @@ -1278,7 +1280,17 @@ static void __init init_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. 
*/ + for_each_online_cpu(cpu) { + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } } static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, @@ -1308,7 +1320,7 @@ static void __init xen_hvm_guest_init(void) if (r < 0) return; - init_shared_info(); + xen_hvm_init_shared_info(); if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index a9c66110803..d07479c340f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,12 @@ void xen_pre_suspend(void) BUG(); } +void xen_hvm_post_suspend(int suspend_cancelled) +{ + xen_hvm_init_shared_info(); + xen_callback_vector(); +} + void xen_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d0e0e6a747..01c9dd38652 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -39,6 +39,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.2.3 From 409771d258e9dd71c30f3c9520fd2b796ffc40f0 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:48:19 +0100 Subject: x86: Use xen_vcpuop_clockevent, xen_clocksource and xen wallclock. Use xen_vcpuop_clockevent instead of hpet and APIC timers as main clockevent device on all vcpus, use the xen wallclock time as wallclock instead of rtc and use xen_clocksource as clocksource. The pv clock algorithm needs to work correctly for the xen_clocksource and xen wallclock to be usable, only modern Xen versions offer a reliable pv clock in HVM guests (XENFEAT_hvm_safe_pvclock). Using the hpet as clocksource means a VMEXIT every time we read/write to the hpet mmio addresses, pvclock give us a better rating without VMEXITs. Same goes for the xen wallclock and xen_vcpuop_clockevent Signed-off-by: Stefano Stabellini Signed-off-by: Don Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 14 ++---------- arch/x86/xen/suspend.c | 6 +++++ arch/x86/xen/time.c | 58 +++++++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/xen-ops.h | 7 ++---- 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 127c95c8d15..a9017296388 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -935,10 +935,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, }; -static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, -}; - static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, @@ -1076,7 +1072,6 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; @@ -1084,13 +1079,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; - x86_init.timers.timer_init = xen_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = x86_init_noop; - - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + xen_init_time_ops(); /* * Set up some pagetable state before starting to set any ptes. 
@@ -1327,6 +1316,7 @@ static void __init xen_hvm_guest_init(void) register_cpu_notifier(&xen_hvm_cpu_notifier); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d07479c340f..1d789d56877 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -28,8 +28,14 @@ void xen_pre_suspend(void) void xen_hvm_post_suspend(int suspend_cancelled) { + int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } } void xen_post_suspend(int suspend_cancelled) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed30..4780e55886a 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -160,7 +161,7 @@ static void do_stolen_accounting(void) * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED * states. */ -unsigned long long xen_sched_clock(void) +static unsigned long long xen_sched_clock(void) { struct vcpu_runstate_info state; cycle_t now; @@ -195,7 +196,7 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ -unsigned long xen_tsc_khz(void) +static unsigned long xen_tsc_khz(void) { struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; @@ -230,7 +231,7 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -unsigned long xen_get_wallclock(void) +static unsigned long xen_get_wallclock(void) { struct timespec ts; @@ -238,7 +239,7 @@ unsigned long xen_get_wallclock(void) return ts.tv_sec; } -int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(unsigned long now) { /* do nothing for domU */ return -1; @@ -473,7 +474,11 @@ void xen_timer_resume(void) } } -__init void xen_time_init(void) +static const struct pv_time_ops xen_time_ops __initdata = { + .sched_clock = xen_sched_clock, +}; + +static __init void xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -497,3 +502,46 @@ __init void xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } + +__init void xen_init_time_ops(void) +{ + pv_time_ops = xen_time_ops; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + +static void xen_hvm_setup_cpu_clockevents(void) +{ + int cpu = smp_processor_id(); + xen_setup_runstate_info(cpu); + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} + +__init void xen_hvm_init_time_ops(void) +{ + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 */ + if (!xen_have_vector_callback && num_present_cpus() > 1) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { + printk(KERN_INFO "Xen doesn't support pvclock on HVM," + "disable pv timer\n"); + return; + } + + pv_time_ops = xen_time_ops; + x86_init.timers.setup_percpu_clockev = xen_time_init; + x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 
01c9dd38652..089d18923d2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -49,11 +49,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -unsigned long xen_tsc_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); +void __init xen_init_time_ops(void); +void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.2.3 From c1c5413ad58cb73267d328e6020268aa2e50d8ca Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:44:30 +0100 Subject: x86: Unplug emulated disks and nics. Add a xen_emul_unplug command line option to the kernel to unplug xen emulated disks and nics. Set the default value of xen_emul_unplug depending on whether or not the Xen PV frontends and the Xen platform PCI driver have been compiled for this kernel (modules or built-in are both OK). The user can specify xen_emul_unplug=ignore to enable PV drivers on HVM even if the host platform doesn't support unplug. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/platform-pci-unplug.c | 135 +++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 4 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/platform-pci-unplug.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f..93095468598 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o + grant-table.o suspend.o platform-pci-unplug.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a9017296388..157c93b62dd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1314,6 +1314,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 00000000000..2f7f3fb3477 --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c @@ -0,0 +1,135 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#include +#include +#include + +#include + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +/* store the value of xen_emul_unplug after the unplug is done */ +int xen_platform_pci_unplug; +EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +static int xen_emul_unplug; + +static int __init check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +void __init xen_unplug_emulated_devices(void) +{ + int r; + + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the version is really old + * and the user told us to ignore it. */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_IGNORE))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). 
*/ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "ignore", l)) + xen_emul_unplug |= XEN_UNPLUG_IGNORE; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 089d18923d2..ed776949024 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -40,6 +40,7 @@ void xen_vcpu_restore(void); void xen_callback_vector(void); void xen_hvm_init_shared_info(void); +void __init xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.2.3 From 5915100106b8f14a38053ad6c03a664d208aeaa2 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 17 Jun 2010 14:22:52 +0100 Subject: x86: Call HVMOP_pagetable_dying on exit_mmap. When a pagetable is about to be destroyed, we notify Xen so that the hypervisor can clear the related shadow pagetable. 
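[Annotator's note: the hook added below is reached from the generic mm teardown path. Simplified, and reproduced from memory as a sketch rather than verbatim, the chain looks like this; arch_exit_mmap() and the pv_mmu_ops indirection are pre-existing x86 code.]

	/* mm/mmap.c: final teardown of an address space (simplified) */
	void exit_mmap(struct mm_struct *mm)
	{
		arch_exit_mmap(mm);	/* x86: PVOP call to pv_mmu_ops.exit_mmap */
		/* ... unmap VMAs, free the page tables ... */
	}

So once xen_hvm_init_mmu_ops() has pointed pv_mmu_ops.exit_mmap at the Xen handler, every address-space destruction issues one HVMOP_pagetable_dying hypercall before the page tables are freed, letting Xen drop the shadow early instead of faulting it away page by page.
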
Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.h | 1 + 3 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 157c93b62dd..75b479a684f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1318,6 +1318,7 @@ static void __init xen_hvm_guest_init(void) have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..84648c1bf13 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include @@ -1941,6 +1942,38 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +} + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe6bc7f5ec..fa938c4aa2f 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); +extern void xen_hvm_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit v1.2.3 From b43275d661baa5f1f72dacd9033d6eda09d9fe87 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 Jul 2010 10:38:45 -0700 Subject: xen/pvhvm: fix build problem when !CONFIG_XEN x86_hyper_xen_hvm is only defined when Xen is enabled in the kernel config. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/hypervisor.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bffd47c10fe..5bccedcb912 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN &x86_hyper_xen_hvm, +#endif }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.2.3 From 8c73626ab28527b7eb7f3061c027fbfe530c488c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:18 -0700 Subject: x86: Fix vtime/file timestamp inconsistencies Due to vtime calling vgettimeofday(), its possible that an application could call time();create("stuff",O_RDRW); only to see the file's creation timestamp to be before the value returned by time. A similar way to reproduce the issue is to compare the vsyscall time() with the syscall time(), and observe ordering issues. 
The modified test case from Oleg Nesterov below can illustrate this: int main(void) { time_t sec1,sec2; do { sec1 = time(&sec2); sec2 = syscall(__NR_time, NULL); } while (sec1 <= sec2); printf("vtime: %d.000000\n", sec1); printf("time: %d.000000\n", sec2); return 0; } The proper fix is to make vtime use the same time value as current_kernel_time() (which is exported via update_vsyscall) instead of vgettime(). Thanks to Jiri Olsa for bringing up the issue and catching bugs in earlier verisons of this fix. Signed-off-by: John Stultz Cc: Jiri Olsa Cc: Oleg Nesterov LKML-Reference: <1279068988-21864-2-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60..dce0c3c5a78 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; -- cgit v1.2.3 From 592913ecb87a9e06f98ddb55b298f1a66bf94c6b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:20 -0700 Subject: time: Kill off CONFIG_GENERIC_TIME Now that all arches have been converted over to use generic time via clocksources or arch_gettimeoffset(), we can remove the GENERIC_TIME config option and simplify the generic code. Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-4-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..546b610ad71 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,9 +72,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 -config GENERIC_TIME - def_bool y - config GENERIC_CMOS_UPDATE def_bool y @@ -2046,7 +2043,7 @@ config SCx200 config SCx200HR_TIMER tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" - depends on SCx200 && GENERIC_TIME + depends on SCx200 default y ---help--- This driver provides a clocksource built upon the on-chip -- cgit v1.2.3 From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:23 -0700 Subject: timkeeping: Fix update_vsyscall to provide wall_to_monotonic offset update_vsyscall() did not provide the wall_to_monotoinc offset, so arch specific implementations tend to reference wall_to_monotonic directly. 
This limits future cleanups in the timekeeping core, so this patch fixes the update_vsyscall interface to provide wall_to_monotonic, allowing wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: Anton Blanchard Cc: Paul Mackerras Cc: Tony Luck LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dce0c3c5a78..dcbb28c4b69 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -- cgit v1.2.3 From f12a15be63d1de9a35971f35f06b73088fa25c3a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:27 -0700 Subject: x86: Convert common clocksources to use clocksource_register_hz/khz This converts the most common of the x86 clocksources over to use clocksource_register_hz/khz. 
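[Annotator's note: to make the conversion pattern concrete, here is a hedged before/after sketch for a driver-owned clocksource. The hardware read, read_hw_counter(), is a made-up placeholder; the rest follows the clocksource API as of this series.]

	#include <linux/clocksource.h>

	static cycle_t read_mycounter(struct clocksource *cs)
	{
		return (cycle_t)read_hw_counter();	/* hypothetical MMIO read */
	}

	static struct clocksource mycs = {
		.name	= "mycounter",
		.rating	= 200,
		.read	= read_mycounter,
		.mask	= CLOCKSOURCE_MASK(32),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init mycs_register(unsigned long freq_khz)
	{
		/* Old style: pick a shift by hand and derive mult from it:
		 *	mycs.shift = 22;
		 *	mycs.mult  = clocksource_khz2mult(freq_khz, mycs.shift);
		 *	return clocksource_register(&mycs);
		 * New style: let the core compute an optimal mult/shift pair
		 * from the counter frequency:
		 */
		return clocksource_register_khz(&mycs, freq_khz);
	}
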
Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-11-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 13 +++++++++---- arch/x86/kernel/tsc.c | 5 +---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d73117..33dbcc4ec5f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae184..ce8e5023933 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 80a506b8fdcfa868bb53eb740f928217d0966fc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 17:14:24 +0200 Subject: x86/amd-iommu: Export cache-coherency capability This patch exports the capability of the AMD IOMMU to force cache coherency of DMA transactions through the IOMMU-API. This is required to disable some nasty hacks in KVM when this capability is not available. 
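[Annotator's note: on the consumer side the capability is queried through the IOMMU API. A minimal sketch of the kind of check KVM can now make; iommu_domain_has_cap() is the existing IOMMU-API entry point that lands in the amd_iommu_domain_has_cap() implementation below.]

	#include <linux/iommu.h>

	/* Returns true when DMA through this domain is NOT forced coherent,
	 * i.e. when the caller still needs its cache-flushing workaround. */
	static bool domain_needs_cache_hack(struct iommu_domain *dom)
	{
		return !iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY);
	}
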
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 29dd3b9f2f0..fa044e1e30a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } -- cgit v1.2.3 From d2cb214551de8180542a04ec8c86c0c9412c5124 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 26 Mar 2010 15:37:50 -0700 Subject: xen/mmu: inhibit vmap aliases rather than trying to clear them out Rather than trying to deal with aliases once they appear, just completely inhibit them. Mostly the removal of aliases was managable, but it comes unstuck in xen_create_contiguous_region() because it gets executed at interrupt time (as a result of dma_alloc_coherent()), which causes all sorts of confusion in the vmap code, as it was never intended to be run in interrupt context. This has the unfortunate side effect of removing all the unmap batching the vmap code so carefully added, but that can't be helped. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb51402dd99..ef5728dde8f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1015,8 +1016,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - vm_unmap_aliases(); - xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1580,7 +1579,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - vm_unmap_aliases(); if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE && USE_SPLIT_PTLOCKS) @@ -2026,6 +2024,8 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; + + vmap_lazy_unmap = false; } /* Protected by xen_reservation_lock. */ @@ -2165,8 +2165,6 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Zap current PTEs, remembering MFNs. */ @@ -2204,8 +2202,6 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ -- cgit v1.2.3 From bbbe57386e857eb2a8d4abcae71063c819c06ff1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 9 Feb 2010 14:30:55 -0500 Subject: pci-swiotlb-xen: Add glue code to setup dma_ops utilizing xen_swiotlb_* functions. We add the glue code that sets up a dma_ops structure with the xen_swiotlb_* functions. 
The code turns on xen_swiotlb flag when it detects it is running under Xen and it is either in privileged mode or the iommu=soft flag was passed in. It also disables the bare-metal SWIOTLB if the Xen-SWIOTLB has been enabled. Note: The Xen-SWIOTLB is only built when CONFIG_XEN is enabled. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Cc: FUJITA Tomonori Cc: Albert Herranz Cc: Ian Campbell --- arch/x86/include/asm/xen/swiotlb-xen.h | 14 ++++++++ arch/x86/xen/Makefile | 1 + arch/x86/xen/pci-swiotlb-xen.c | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 arch/x86/include/asm/xen/swiotlb-xen.h create mode 100644 arch/x86/xen/pci-swiotlb-xen.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h new file mode 100644 index 00000000000..1be1ab7d6a4 --- /dev/null +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_SWIOTLB_XEN_H +#define _ASM_X86_SWIOTLB_XEN_H + +#ifdef CONFIG_SWIOTLB_XEN +extern int xen_swiotlb; +extern int __init pci_xen_swiotlb_detect(void); +extern void __init pci_xen_swiotlb_init(void); +#else +#define xen_swiotlb (0) +static inline int __init pci_xen_swiotlb_detect(void) { return 0; } +static inline void __init pci_xen_swiotlb_init(void) { } +#endif + +#endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f..32af238055c 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 00000000000..a013ec9d0c5 --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -0,0 +1,58 @@ +/* Glue code to lib/swiotlb-xen.c */ + +#include +#include + +#include +#include + +int xen_swiotlb __read_mostly; + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc_coherent = xen_swiotlb_alloc_coherent, + .free_coherent = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, +}; + +/* + * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use xen_swiotlb (by the boot + * option). + */ +int __init pci_xen_swiotlb_detect(void) +{ + + /* If running as PV guest, either iommu=soft, or swiotlb=force will + * activate this IOMMU. If running as PV privileged, activate it + * irregardlesss. + */ + if ((xen_initial_domain() || swiotlb || swiotlb_force) && + (xen_pv_domain())) + xen_swiotlb = 1; + + /* If we are running under Xen, we MUST disable the native SWIOTLB. + * Don't worry about swiotlb_force flag activating the native, as + * the 'swiotlb' flag is the only one turning it on. 
*/ + if (xen_pv_domain()) + swiotlb = 0; + + return xen_swiotlb; +} + +void __init pci_xen_swiotlb_init(void) +{ + if (xen_swiotlb) { + xen_swiotlb_init(1); + dma_ops = &xen_swiotlb_dma_ops; + } +} -- cgit v1.2.3 From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 17:01:49 -0700 Subject: x86: Add memory modify constraints to xchg() and cmpxchg() xchg() and cmpxchg() modify their memory operands, not merely read them. For some versions of gcc the "memory" clobber has apparently dealt with the situation, but not for all. Originally-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Glauber Costa Cc: Avi Kivity Cc: Peter Palfrader Cc: Greg KH Cc: Alan Cox Cc: Zachary Amsden Cc: Marcelo Tosatti Cc: LKML-Reference: <4C4F7277.8050306@zytor.com> --- arch/x86/include/asm/cmpxchg_32.h | 68 +++++++++++++++++++-------------------- arch/x86/include/asm/cmpxchg_64.h | 40 +++++++++++------------ 2 files changed, 54 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 8859e12dd3c..c1cf59d72f0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -27,20 +27,20 @@ struct __xchg_dummy { switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -70,14 +70,14 @@ static inline void __set_64bit(unsigned long long *ptr, unsigned int low, unsigned int high) { asm volatile("\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%0)\n\t" + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + LOCK_PREFIX "cmpxchg8b (%1)\n\t" "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) + : "=m" (*ptr) + : "D" (ptr), + "b" (low), + "c" (high) : "ax", "dx", "memory"); } @@ -121,21 +121,21 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ @@ -180,12 +180,12 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile(LOCK_PREFIX "cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned 
long)(new >> 32)), + "0" (old) : "memory"); return prev; } @@ -195,12 +195,12 @@ static inline unsigned long long __cmpxchg64_local(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile("cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile("cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned long)(new >> 32)), + "0" (old) : "memory"); return prev; } diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 485ae415fae..b92f147339f 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -26,26 +26,26 @@ extern void __cmpxchg_wrong_size(void); switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %k0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 8: \ asm volatile("xchgq %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -71,27 +71,27 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %k1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %k2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 8: \ - asm volatile(lock "cmpxchgq %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ -- cgit v1.2.3 From c7f52cdc2f3e1733d3864e439ac2e92edd99ef31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 22 Jul 2010 22:58:01 -0700 Subject: support multiple .discard.* sections to avoid section type conflicts gcc 4.4.4 will complain if you use a .discard section for both text and data ("causes a section type conflict"). Add support for ".discard.*" sections, and use .discard.text for a dummy function in the x86 RESERVE_BRK() macro. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 86b1506f417..ef292c792d7 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align); * executable.) 
*/ #define RESERVE_BRK(name,sz) \ - static void __section(.discard) __used \ + static void __section(.discard.text) __used \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit v1.2.3 From 69309a05907546fb686b251d4ab041c26afe1e1d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:29:52 -0700 Subject: x86, asm: Clean up and simplify set_64bit() Clean up and simplify set_64bit(). This code is quite old (1.3.11) and contains a fair bit of auxilliary machinery that current versions of gcc handle just fine automatically. Worse, the auxilliary machinery can actually cause an unnecessary spill to memory. Furthermore, the loading of the old value inside the loop in the 32-bit case is unnecessary: if the value doesn't match, the CMPXCHG8B instruction will already have loaded the "new previous" value for us. Clean up the comment, too, and remove page references to obsolete versions of the Intel SDM. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 67 ++++++++++++--------------------------- arch/x86/include/asm/cmpxchg_64.h | 4 +-- 2 files changed, 21 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index c1cf59d72f0..20955ea7bc1 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -53,60 +53,33 @@ struct __xchg_dummy { __xchg((v), (ptr), sizeof(*ptr)) /* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) + * CMPXCHG8B only writes to the target if we had the previous + * value in registers, otherwise it acts as a read and gives us the + * "new previous" value. That is why there is a loop. Preloading + * EDX:EAX is a performance optimization: in the common case it means + * we need only one locked operation. * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very + * least an FPU save and/or %cr0.ts manipulation. + * + * cmpxchg8b must be used with the lock prefix here to allow the + * instruction to be executed atomically. We need to have the reader + * side to see the coherent 64bit value. 
*/ -static inline void __set_64bit(unsigned long long *ptr, - unsigned int low, unsigned int high) +static inline void set_64bit(volatile u64 *ptr, u64 value) { + u32 low = value; + u32 high = value >> 32; + u64 prev = *ptr; + asm volatile("\n1:\t" - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%1)\n\t" + LOCK_PREFIX "cmpxchg8b %0\n\t" "jnz 1b" - : "=m" (*ptr) - : "D" (ptr), - "b" (low), - "c" (high) - : "ax", "dx", "memory"); -} - -static inline void __set_64bit_constant(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); -} - -#define ll_low(x) *(((unsigned int *)&(x)) + 0) -#define ll_high(x) *(((unsigned int *)&(x)) + 1) - -static inline void __set_64bit_var(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, ll_low(value), ll_high(value)); + : "=m" (*ptr), "+A" (prev) + : "b" (low), "c" (high) + : "memory"); } -#define set_64bit(ptr, value) \ - (__builtin_constant_p((value)) \ - ? __set_64bit_constant((ptr), (value)) \ - : __set_64bit_var((ptr), (value))) - -#define _set_64bit(ptr, value) \ - (__builtin_constant_p(value) \ - ? __set_64bit(ptr, (unsigned int)(value), \ - (unsigned int)((value) >> 32)) \ - : __set_64bit(ptr, ll_low((value)), ll_high((value)))) - extern void __cmpxchg_wrong_size(void); /* diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index b92f147339f..9596e7c6196 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -5,13 +5,11 @@ #define __xg(x) ((volatile long *)(x)) -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -#define _set_64bit set_64bit - extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); -- cgit v1.2.3 From 18642a57df02a044b91219d3176128996ddc81a5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:52:29 -0700 Subject: x86, vdso: Don't quote $nm in the script for checking vdso references Don't quote $nm in the script for checking the vdso for external references. Doing so breaks multiword constructs, like using CROSS_COMPILE='ccache '. Reported-by: Stephen Rothwell Signed-off-by: H. Peter Anvin LKML-Reference: <20100728134252.2e4c27cf.sfr@canb.auug.org.au> --- arch/x86/vdso/checkundef.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh index 490be1c38f9..7ee90a9b549 100755 --- a/arch/x86/vdso/checkundef.sh +++ b/arch/x86/vdso/checkundef.sh @@ -1,7 +1,7 @@ #!/bin/sh nm="$1" file="$2" -"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +$nm "$file" | grep '^ *U' > /dev/null 2>&1 if [ $? -eq 1 ]; then exit 0 else -- cgit v1.2.3 From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:25 -0500 Subject: fanotify: fanotify_init syscall declaration This patch defines a new syscall fanotify_init() of the form: int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags, unsigned int priority) This syscall is used to create and fanotify group. This is very similar to the inotify_init() syscall. 
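[Annotator's note: from userspace the new entry point is reachable with a raw syscall(2) until a glibc wrapper exists. A hedged sketch matching the three-argument signature declared above; the flags and priority values belong to the fanotify UAPI, and 0/O_RDONLY/0 are used here purely for illustration.]

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/syscall.h>

	int main(void)
	{
		/* fanotify_init(flags, event_f_flags, priority) */
		int fd = syscall(__NR_fanotify_init, 0, O_RDONLY, 0);

		if (fd < 0) {
			perror("fanotify_init");
			return 1;
		}
		return 0;
	}
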
Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa..586cb3be2e3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_fanotify_init ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a..981c7e7ad80 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_fanotify_init 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81..4f23e04bdb3 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_fanotify_init 300 +__SYSCALL(__NR_fanotify_init, sys_fanotify_init) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b372934121..e38793b50e1 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init -- cgit v1.2.3 From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:26 -0500 Subject: fanotify: sys_fanotify_mark declartion This patch simply declares the new sys_fanotify_mark syscall int fanotify_mark(int fanotify_fd, unsigned int flags, u64_mask, int dfd const char *pathname) Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/ia32/sys_ia32.c | 9 +++++++++ arch/x86/include/asm/sys_ia32.h | 3 +++ arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 6 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 586cb3be2e3..17cf65c9480 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -843,4 +843,5 @@ ia32_sys_call_table: .quad sys_perf_event_open .quad compat_sys_recvmmsg .quad sys_fanotify_init + .quad sys32_fanotify_mark ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 626be156d88..3d093311d5e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags, + u32 mask_lo, u32 mask_hi, + int fd, const char __user *pathname) +{ + return sys_fanotify_mark(fanotify_fd, flags, + ((u64)mask_hi << 
32) | mask_lo, + fd, pathname); +} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3ad421784ae..cf4e2e381cb 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *); /* ia32/ipc32.c */ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); + +asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, + const char __user *); #endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 981c7e7ad80..80b799cd74f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -344,10 +344,11 @@ #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 #define __NR_fanotify_init 338 +#define __NR_fanotify_mark 339 #ifdef __KERNEL__ -#define NR_syscalls 339 +#define NR_syscalls 340 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4f23e04bdb3..5b7b1d58561 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -665,6 +665,8 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_fanotify_init 300 __SYSCALL(__NR_fanotify_init, sys_fanotify_init) +#define __NR_fanotify_mark 301 +__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e38793b50e1..07ad5eb7cc5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -338,3 +338,4 @@ ENTRY(sys_call_table) .long sys_perf_event_open .long sys_recvmmsg .long sys_fanotify_init + .long sys_fanotify_mark -- cgit v1.2.3 From d78d671db478eb8b14c78501c0cee1cc7baf6967 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:30 +0200 Subject: x86, cpu: AMD errata checking framework Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that have an OSVW id assigned, which it takes as first argument. Both take a variable number of family-specific model-stepping ranges created by AMD_MODEL_RANGE(). Iff an erratum has an OSVW id, OSVW is available on the CPU, and the OSVW id is known to the hardware, it is used to determine whether an erratum is present. Otherwise, the model-stepping ranges are matched against the current CPU to find out whether the erratum applies. For certain special errata, the code using this framework might have to conduct further checks to make sure an erratum is really (not) present. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 18 ++++++++++++ arch/x86/kernel/cpu/amd.c | 60 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7e5c6a60b8e..5084c2f5ac2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1025,4 +1025,22 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, return ratio; } +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) 
{ -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12b9cff047c..80665410b06 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -609,3 +609,63 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} -- cgit v1.2.3 From 9d8888c2a214aece2494a49e699a097c2ba9498b Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:31 +0200 Subject: x86, cpu: Clean up AMD erratum 400 workaround Remove check_c1e_idle() and use the new AMD errata checking framework instead. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-2-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 5 +++++ arch/x86/kernel/process.c | 39 ++------------------------------------- 3 files changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5084c2f5ac2..eebdc1fde3d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); #define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80665410b06..a62a4ae7a11 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,6 +628,11 @@ cpu_dev_register(amd_cpu_dev); * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); */ +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + + bool cpu_has_amd_erratum(const int *erratum) { struct cpuinfo_x86 *cpu = &current_cpu_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..553b02f1309 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} - static cpumask_var_t c1e_mask; static int c1e_detected; @@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else -- cgit v1.2.3 From 1be85a6d93f4207d8c2c6238c4a96895e28cefba Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:32 +0200 Subject: x86, cpu: Use AMD errata checking framework for erratum 383 Use the AMD errata checking framework instead of open-coding the test. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-3-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 2 ++ arch/x86/kvm/svm.c | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eebdc1fde3d..d85637bb950 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_383[]; extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a62a4ae7a11..30f30dcbdb8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,6 +632,8 @@ const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); bool cpu_has_amd_erratum(const int *erratum) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..03b534b34ee 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -383,8 +383,7 @@ static void svm_init_erratum_383(void) int err; u64 val; - /* Only Fam10h is affected */ - if (boot_cpu_data.x86 != 0x10) + if (!cpu_has_amd_erratum(amd_erratum_383)) return; /* Use _safe variants to not break nested virtualization */ -- cgit v1.2.3 From 4532b305e8f0c238dd73048068ff8a6dd1380291 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 15:18:35 -0700 Subject: x86, asm: Clean up and simplify Remove the __xg() hack to create a memory barrier near xchg and cmpxchg; it has been there since 1.3.11 but should not be necessary with "asm volatile" and a "memory" clobber, neither of which were there in the original implementation. However, we *should* make this a volatile reference. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 75 ++++++++++++++++++++++----------------- arch/x86/include/asm/cmpxchg_64.h | 61 +++++++++++++++++++++---------- 2 files changed, 84 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 20955ea7bc1..f5bd1fd388f 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -11,38 +11,42 @@ extern void __xchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. 
*/ - -struct __xchg_dummy { - unsigned long a[100]; -}; -#define __xg(x) ((struct __xchg_dummy *)(x)) - #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -94,23 +98,32 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ @@ -148,31 +161,27 @@ extern void __cmpxchg_wrong_size(void); (unsigned long long)(n))) #endif -static inline unsigned long long __cmpxchg64(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile(LOCK_PREFIX "cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; } -static inline unsigned long long __cmpxchg64_local(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile("cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 9596e7c6196..423ae58aa02 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,8 +3,6 @@ #include /* Provides LOCK_PREFIX */ -#define __xg(x) ((volatile long *)(x)) - static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; @@ -14,38 +12,51 @@ extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. 
--ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. */ #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile("xchgl %k0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -69,29 +80,41 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile(lock "cmpxchgl %k2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile(lock "cmpxchgq %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ -- cgit v1.2.3 From a5b91606bdc9d0a0d036d2d829a22921c705573e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:23:20 -0700 Subject: x86, cpu: Export AMD errata definitions Export the AMD errata definitions, since they are needed by kvm_amd.ko if that is built as a module. Doing "make allmodconfig" during testing would have caught this. Signed-off-by: H. 
Peter Anvin Cc: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> --- arch/x86/kernel/cpu/amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 30f30dcbdb8..60a57b13082 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,9 +631,11 @@ cpu_dev_register(amd_cpu_dev); const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { @@ -676,3 +678,5 @@ bool cpu_has_amd_erratum(const int *erratum) return false; } + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); -- cgit v1.2.3 From 90c8f92f5c807807ca74d5f2f313794925174e6b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:53:49 -0700 Subject: x86, asm: Move cmpxchg emulation code to arch/x86/lib Move cmpxchg emulation code from arch/x86/kernel/cpu (which is otherwise CPU identification) to arch/x86/lib, where other emulation code lives already. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cmpxchg.c | 72 ------------------------------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/cmpxchg.c | 72 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 73 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cmpxchg.c create mode 100644 arch/x86/lib/cmpxchg.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6..c47c43914ba 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,7 +16,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572c..00000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include -#include -#include - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. 
Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f871e04b696..e10cf070ede 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o + lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c new file mode 100644 index 00000000000..2056ccf572c --- /dev/null +++ b/arch/x86/lib/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + -- cgit v1.2.3 From a378d9338e8dde78314b3a6ae003de351936c729 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 17:05:11 -0700 Subject: x86, asm: Merge cmpxchg_486_u64() and cmpxchg8b_emu() We have two functions for doing exactly the same thing -- emulating cmpxchg8b on 486 and older hardware -- with different calling conventions, and yet doing the same thing. Drop the C version and use the assembly version, via alternatives, for both the local and non-local versions of cmpxchg8b. Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 30 ++++++++++++++---------------- arch/x86/lib/cmpxchg.c | 18 ------------------ 2 files changed, 14 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f5bd1fd388f..284a6e8f7ce 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -246,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); - #define cmpxchg64(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ @@ -265,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); __ret; }) - -#define cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64_local(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) #endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c index 2056ccf572c..5d619f6df3e 100644 --- a/arch/x86/lib/cmpxchg.c +++ b/arch/x86/lib/cmpxchg.c @@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) } EXPORT_SYMBOL(cmpxchg_386_u32); #endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -- cgit v1.2.3 From 3709c857350976408953831f0cf89d19951394a1 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Thu, 22 Jul 2010 14:57:35 +0900 Subject: x86: Ioremap: fix wrong physical address handling in PAT code The following two commits fixed a problem where x86 ioremap() doesn't handle physical addresses higher than 32 bits properly in X86_32 PAE mode. ffa71f33a820d1ab3f2fc5723819ac60fb76080b (x86, ioremap: Fix incorrect physical address handling in PAE mode) 35be1b716a475717611b2dc04185e9d80b9cb693 (x86, ioremap: Fix normal ram range check) But these fixes are not enough, since pat_pagerange_is_ram() in PAT code also has the same problem. This patch fixes it. 
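For illustration only (a minimal user-space C sketch with hypothetical values, not part of the patch): under PAE a physical address can exceed 32 bits, so storing it in a 32-bit 'unsigned long' silently drops the high bits, which is why the range check has to take resource_size_t.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t phys = 0x100000000ULL;       /* 4 GiB: representable under PAE */
	uint32_t as_ulong32 = (uint32_t)phys; /* what a 32-bit ulong would keep */

	/* prints: full 0x100000000, truncated 0x0 */
	printf("full %#llx, truncated %#x\n",
	       (unsigned long long)phys, as_ulong32);
	return 0;
}
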
Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Kenji Kaneshige LKML-Reference: <4C47DDCF.80300@jp.fujitsu.com> Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index acc15b23b74..03b48c80c65 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ram_page = 0, not_rampage = 0; unsigned long page_nr; -- cgit v1.2.3 From ca65f9fc0c447da5b270b05c41c21b19c88617c3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Jul 2010 14:37:48 +0100 Subject: Introduce CONFIG_XEN_PVHVM compile option This patch introduces a CONFIG_XEN_PVHVM compile time option to enable/disable Xen PV on HVM support. Signed-off-by: Stefano Stabellini --- arch/x86/kernel/cpu/hypervisor.c | 2 +- arch/x86/xen/Kconfig | 5 +++++ arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 2 ++ arch/x86/xen/platform-pci-unplug.c | 2 ++ arch/x86/xen/time.c | 3 ++- 6 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 5bccedcb912..8095f8611f8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif }; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index b83e119fbeb..68128a1b401 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,11 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. 
+config XEN_PVHVM + def_bool y + depends on XEN + depends on X86_LOCAL_APIC + config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" default 8 if X86_32 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75b479a684f..6f5345378ab 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1282,6 +1282,7 @@ void xen_hvm_init_shared_info(void) } } +#ifdef CONFIG_XEN_PVHVM static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1338,3 +1339,4 @@ const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { .init_platform = xen_hvm_guest_init, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); +#endif diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 84648c1bf13..413b19b3d0f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1942,6 +1942,7 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_exit_mmap(struct mm_struct *mm) { struct xen_hvm_pagetable_dying a; @@ -1973,6 +1974,7 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; } +#endif #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 2f7f3fb3477..554c002a1e1 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -32,6 +32,7 @@ /* store the value of xen_emul_unplug after the unplug is done */ int xen_platform_pci_unplug; EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +#ifdef CONFIG_XEN_PVHVM static int xen_emul_unplug; static int __init check_platform_magic(void) @@ -133,3 +134,4 @@ static int __init parse_xen_emul_unplug(char *arg) return 0; } early_param("xen_emul_unplug", parse_xen_emul_unplug); +#endif diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 4780e55886a..2aab4a2b910 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -516,6 +516,7 @@ __init void xen_init_time_ops(void) x86_platform.set_wallclock = xen_set_wallclock; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); @@ -544,4 +545,4 @@ __init void xen_hvm_init_time_ops(void) x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } - +#endif -- cgit v1.2.3 From 73cd3b43f08cc9a9bcb168994b8e9ebd983ff573 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 15 Jun 2010 15:43:19 +0200 Subject: x86/PCI: pci, fix section mismatch pcibios_scan_specific_bus calls pci_scan_bus_on_node which is __devinit. Mark pcibios_scan_specific_bus __devinit as well since all users are now __init or __devinit. Signed-off-by: Jiri Slaby Signed-off-by: Jesse Barnes --- arch/x86/pci/legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 8d460eaf524..c89266be604 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -36,7 +36,7 @@ int __init pci_legacy_init(void) return 0; } -void pcibios_scan_specific_bus(int busn) +void __devinit pcibios_scan_specific_bus(int busn) { int devfn; long node; -- cgit v1.2.3 From 7bd1c365fd124624191d49dcc1eb9759d6017ec3 Mon Sep 17 00:00:00 2001 From: Mike Habeck Date: Wed, 12 May 2010 11:14:32 -0700 Subject: x86/PCI: Add option to not assign BAR's if not already assigned The Linux kernel assigns BARs that a BIOS did not assign, most likely to handle broken BIOSes that didn't enumerate the devices correctly. 
On UV the BIOS purposely doesn't assign I/O BARs for certain devices/drivers we know don't use them (examples: LSI SAS, Qlogic FC, ...). We purposely don't assign these I/O BARs because I/O Space is a very limited resource. There is only 64k of I/O Space, and in a PCIe topology that space gets divided up into 4k chunks (this is due to the fact that a pci-to-pci bridge's I/O decoder is aligned at 4k)... Thus a system can have at most 16 cards with I/O BARs: (64k / 4k = 16) SGI needs to scale to >16 devices with I/O BARs. So by not assigning I/O BARs on devices we know don't use them, we can do that (iff the kernel doesn't go and assign these BARs that the BIOS purposely didn't assign). This patch will not assign a resource to a device BAR if that BAR was not assigned by the BIOS, and the kernel cmdline option 'pci=nobar' was specified. This patch is closely modeled after the 'pci=norom' option that currently exists in the tree. Signed-off-by: Mike Habeck Signed-off-by: Mike Travis Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index cd2a31dc5fb..49c7219826f 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -30,6 +30,7 @@ #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 +#define PCI_NOASSIGN_BARS 0x200000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 215a27ae050..a0772af64ef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void) static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) { struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + struct resource *bar_r; + int bar; + + if (pci_probe & PCI_NOASSIGN_BARS) { + /* + * If the BIOS did not assign the BAR, zero out the + * resource so the kernel doesn't attempt to assign + * it later on in pci_assign_unassigned_resources + */ + for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + bar_r = &dev->resource[bar]; + if (bar_r->start == 0 && bar_r->end != 0) { + bar_r->flags = 0; + bar_r->end = 0; + } + } + } if (pci_probe & PCI_NOASSIGN_ROMS) { if (rom_r->parent) @@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "norom")) { pci_probe |= PCI_NOASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "nobar")) { + pci_probe |= PCI_NOASSIGN_BARS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; -- cgit v1.2.3 From 2491762cfb475dbdfa3db11ebea6de49f58b7fac Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Jul 2010 12:53:27 -0600 Subject: x86/PCI: use host bridge _CRS info on ASRock ALiveSATA2-GLAN This DMI quirk turns on "pci=use_crs" for the ALiveSATA2-GLAN because amd_bus.c doesn't handle this system correctly. The system has a single HyperTransport I/O chain, but has two PCI host bridges to buses 00 and 80. amd_bus.c learns the MMIO range associated with buses 00-ff and that this range is routed to the HT chain hosted at node 0, link 0: bus: [00, ff] on node 0 link 0 bus: 00 index 1 [mem 0x80000000-0xfcffffffff] This includes the address space for both bus 00 and bus 80, and amd_bus.c assumes it's all routed to bus 00. 
We find device 80:01.0, which the BIOS left in the middle of that space, but we don't find a bridge from bus 00 to bus 80, so we conclude that 80:01.0 is unreachable from bus 00, and we move it from the original, working, address to something outside the bus 00 aperture, which does not work: pci 0000:80:01.0: reg 10: [mem 0xfebfc000-0xfebfffff 64bit] pci 0000:80:01.0: BAR 0: assigned [mem 0xfd00000000-0xfd00003fff 64bit] The BIOS told us everything we need to know to handle this correctly, so we're better off if we just pay attention, which lets us leave the 80:01.0 device at the original, working, address: ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-7f]) pci_root PNP0A03:00: host bridge window [mem 0x80000000-0xff37ffff] ACPI: PCI Root Bridge [PCI1] (domain 0000 [bus 80-ff]) pci_root PNP0A08:00: host bridge window [mem 0xfebfc000-0xfebfffff] This was a regression between 2.6.33 and 2.6.34. In 2.6.33, amd_bus.c was used only when we found multiple HT chains. 3e3da00c01d050, which enabled amd_bus.c even on systems with a single HT chain, caused this failure. This quirk was written by Graham. If we ever enable "pci=use_crs" for machines from 2006 or earlier, this quirk should be removed. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=16007 Cc: stable@kernel.org Reported-by: Graham Ramsey Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2ec04c424a6..15466c096ba 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASRock ALiveSATA2-GLAN", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), + }, + }, {} }; -- cgit v1.2.3 From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 23 Jul 2010 14:56:28 +0100 Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove unsafe and unnecessary hardware access" changed read_msi_msg_desc() to return the last MSI message written instead of reading it from the device, since it may be called while the device is in a reduced power state. However, the pSeries platform code really does need to read messages from the device, since they are initially written by firmware. 
Therefore: - Restore the previous behaviour of read_msi_msg_desc() - Add new functions get_cached_msi_msg{,_desc}() which return the last MSI message written - Use the new functions where appropriate Acked-by: Michael Ellerman Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26..4dc0084ec1b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); -- cgit v1.2.3 From 1f7979ac53224b0208e7d3eaeb5fd72ab9687389 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:04:03 +0400 Subject: x86/PCI: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9810a0f76c9..f547ee05f71 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); /* Update IRQ for all devices with the same pirq value */ - while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + for_each_pci_dev(dev2) { pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; @@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void) u8 pin; DBG(KERN_DEBUG "PCI: IRQ fixup\n"); - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* * If the BIOS has set an out of range IRQ number, just * ignore it. Also keep track of which IRQ's are @@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void) return; dev = NULL; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -- cgit v1.2.3 From 68f202e4e87cfab4439568bf397fcc5c7cf8d729 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Jul 2010 11:46:42 -0700 Subject: x86, mtrr: Use stop machine context to rendezvous all the cpu's Use the stop machine context rather than IPI's to rendezvous all the cpus for MTRR initialization that happens during cpu bringup or for MTRR modifications during runtime. This avoids a deadlock scenario (reported by Prarit) like: cpu A holds a read_lock (tasklist_lock for example) with irqs enabled cpu B waits for the same lock with irqs disabled using write_lock_irq cpu C doing set_mtrr() (during AP bringup for example), which will try to rendezvous all the cpus using IPI's This will result in C and A coming to the rendezvous point and waiting for B. B is stuck forever waiting for the lock and thus not reaching the rendezvous point. Using stop cpu (run in the process context of per cpu based keventd) to do this rendezvous avoids this deadlock scenario. Also make sure all the cpu's are in the rendezvous handler before we proceed with the local_irq_save() on each cpu. 
This lock step disabling irqs on all the cpus will avoid other deadlock scenarios (for example, those involving the blocking smp_call_function's, etc.). [ This problem is very old. Marking -stable only for 2.6.35 as the stop_one_cpu_nowait() API is present only in 2.6.35. Any older kernel interested in this fix needs to do some more work in backporting this patch. ] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha LKML-Reference: <1280515602.2682.10.camel@sbsiddha-MOBL3.sc.intel.com> Acked-by: Prarit Bhargava Cc: stable@kernel.org [2.6.35] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 56 +++++++++++++++++++++++++++++++---------- arch/x86/kernel/smpboot.c | 7 ++++++ 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b60..01c0f3ee6cc 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include /* FIXME: kvm_para.h needs this */ +#include #include #include #include @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. 
* When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d..11015fd1abb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) -- cgit v1.2.3 From 9792db6174d9927700ed288e6d74b9391bf785d1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:42 -0700 Subject: x86, cpu: Package Level Thermal Control, Power Limit Notification definitions Add package level thermal and power limit feature support. The two MSRs and features are new starting with Intel's Sandy Bridge processor. Please check Intel 64 and IA-32 Architectures SDM Vol 3A 14.5.6 Power Limit Notification and 14.6 Package Level Thermal Management. This patch also fixes a bug that defined the THERM_INT_LOW_ENABLE and THERM_INT_HIGH_ENABLE bits in reverse. [ hpa: fixed up against current tip:x86/cpu ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-2-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/msr-index.h | 17 +++++++++++++++-- arch/x86/kernel/cpu/scattered.c | 2 ++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4be50ddd4d7..817aa316b18 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -166,6 +166,8 @@ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ +#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7cc4a026331..4ea2a7ca7a4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -224,12 +224,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b -#define THERM_INT_LOW_ENABLE (1 << 0) -#define THERM_INT_HIGH_ENABLE (1 << 1) +#define THERM_INT_HIGH_ENABLE (1 << 0) +#define THERM_INT_LOW_ENABLE (1 << 1) +#define THERM_INT_PLN_ENABLE (1 << 24) #define MSR_IA32_THERM_STATUS 0x0000019c #define THERM_STATUS_PROCHOT (1 << 0) +#define THERM_STATUS_POWER_LIMIT (1 << 10) #define MSR_THERM2_CTL 0x0000019d @@ -241,6 +243,17 @@ #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 + +#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 + +#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) +#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 9815364b477..34b4dad6f0b 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -33,6 +33,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, -- cgit v1.2.3 From 25971865d48a8d0ece5307a59dbd3f06d05a7567 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 16 Jun 2010 23:19:28 -0400 Subject: x86, olpc: Use pr_debug() for EC commands Unconditionally printing EC debug messages was helpful when we were actually debugging the EC, but during normal operation it can get pretty annoying. Using pr_debug allows us finer-grained control. Signed-off-by: Andres Salomon LKML-Reference: <20100616231928.16b539f0@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/olpc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 156605281f5..f5ff3903b38 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -142,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -159,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -173,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } -- cgit v1.2.3 From 54e5bc020ce1c959eaa7be18cedb734b6b13745e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 28 Jun 2010 22:00:29 -0400 Subject: x86, olpc: Constify an olpc_ofw() arg The arguments passed to OFW shouldn't be modified; update the 'args' argument of olpc_ofw to reflect this. This saves us some later casting away of consts. Signed-off-by: Andres Salomon LKML-Reference: <20100628220029.1555ac24@debian> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc_ofw.h | 2 +- arch/x86/kernel/olpc.c | 2 +- arch/x86/kernel/olpc_ofw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 3e63d857c48..08fde475cb3 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -12,7 +12,7 @@ #define olpc_ofw(name, args, res) \ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) -extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res); /* determine whether OFW is available and lives in the proper memory */ diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index f5ff3903b38..0e0cdde519b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -188,7 +188,7 @@ static void __init platform_detect(void) { size_t propsize; __be32 rev; - void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; void *res[] = { &propsize }; if (olpc_ofw("getprop", args, res) || propsize != 4) { diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index f5d499fbe74..3218aa71ab5 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -40,7 +40,7 @@ void __init setup_olpc_ofw_pgd(void) early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res) { int ofw_args[MAXARGS + 3]; -- cgit v1.2.3 From c4026cfd8febcd63dd278894108839f30e525a0e Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 30 Jul 2010 14:10:55 -0500 Subject: x86, UV: Initialize BAU hub map Fix uninitialized uvhub_mask: - An uninitialized bit map variable was causing initialization of non-existent hubs (this one causes boot panics). - And the bit map was too small for large machines. This patch makes it dynamic in size. 
- Fix the case where socket 0 has no enabled cpu's. Don't assume every hub has a socket 0. - uv_init_per_cpu() should be __init. Signed-off-by: Cliff Wickman Cc: # for .35.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 59efb5390b3..312ef029281 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1484,15 +1484,16 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void uv_init_per_cpu(int nuvhubs) +static void __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; int pnode; int uvhub; + int have_hmaster; short socket = 0; unsigned short socket_mask; - unsigned int uvhub_mask; + unsigned char *uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1516,28 +1517,29 @@ static void uv_init_per_cpu(int nuvhubs) uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - uvhub_mask |= (1 << uvhub); + *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1);; + socket = (cpu_to_node(cpu) & 1); bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - uvhub = 0; - while (uvhub_mask) { - if (!(uvhub_mask & 1)) - goto nexthub; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) + continue; + have_hmaster = 0; bdp = &uvhub_descs[uvhub]; socket_mask = bdp->socket_mask; socket = 0; @@ -1551,8 +1553,10 @@ static void uv_init_per_cpu(int nuvhubs) bcp->cpu = cpu; if (i == 0) { smaster = bcp; - if (socket == 0) + if (!have_hmaster) { + have_hmaster++; hmaster = bcp; + } } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; @@ -1566,11 +1570,9 @@ nextsocket: socket++; socket_mask = (socket_mask >> 1); } -nexthub: - uvhub++; - uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); + kfree(uvhub_mask); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; -- cgit v1.2.3 From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 14:28:19 +0800 Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages In common cases, a guest SRAO MCE will cause the corresponding poisoned page to be unmapped and a SIGBUS to be sent to QEMU-KVM, and QEMU-KVM will then relay the MCE to the guest OS. But it is reported that if the poisoned page is accessed in the guest after unmapping and before the MCE is relayed to the guest OS, userspace will be killed. The reason is as follows. Because the poisoned page has been unmapped, a guest access will cause a guest exit and kvm_mmu_page_fault will be called. kvm_mmu_page_fault cannot get the poisoned page for the fault address, so kernel and user space MMIO processing is tried in turn. 
In user MMIO processing, the poisoned page is accessed again, and userspace is killed by force_sig_info. To fix the bug, kvm_mmu_page_fault sends a HWPOISON signal to QEMU-KVM and does not try kernel and user space MMIO processing for the poisoned page. [xiao: fix warning introduced by avi] Reported-by: Max Asbock Signed-off-by: Huang Ying Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a591..b666d8d106a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ + char buf[1]; + void __user *hva; + int r; + + /* Touch the page, so send SIGBUS */ + hva = (void __user *)gfn_to_hva(kvm, gfn); + r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(kvm, gfn); + return 0; + } + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b54..c7f27779c99 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -431,11 +431,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) { - pgprintk("gfn %lx is mmio\n", walker.gfn); - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) -- cgit v1.2.3 From c332c83ae736c72dcf072e96e98a774fce39e722 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 12:24:12 +0300 Subject: KVM: VMX: Simplify vmx_get_nmi_mask() !! is not needed due to the cast to bool. 
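As a minimal illustration (not from the patch): conversion to C99 _Bool already normalizes any non-zero value to 1, which is why the '!!' idiom is redundant when the expression is returned as bool.

#include <assert.h>
#include <stdbool.h>

int main(void)
{
	unsigned int info = 0x8;      /* e.g. an interruptibility-state bit */
	bool masked = info & 0x8;     /* implicit conversion to _Bool */

	assert(masked == 1);          /* same result as !!(info & 0x8) */
	return 0;
}
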
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe7..64252075796 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,9 +2826,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; - else - return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - GUEST_INTR_STATE_NMI); + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -- cgit v1.2.3 From 914ebccd2d8fa439e01fe93b5229534b9e179a69 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 28 Apr 2010 18:50:36 +0900 Subject: KVM: x86: avoid unnecessary bitmap allocation when memslot is clean Although we always allocate a new dirty bitmap in x86's get_dirty_log(), it is only used as a zero-source for copy_to_user() and freed right after that when the memslot is clean. This patch uses clear_user() instead of doing this unnecessary zero-source allocation. Performance improvement: as one would expect, the time spent allocating the bitmap is eliminated entirely. In my test, the improved ioctl was about 4 to 10 times faster than the original one for clean slots. Furthermore, reducing memory allocations and copies also benefits the caches. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64..1b270fd6063 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2797,7 +2797,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -2812,27 +2811,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; - memset(dirty_bitmap, 0, n); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables.
*/ if (is_dirty) { struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) { + vfree(dirty_bitmap); + goto out; + } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +2843,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { + vfree(dirty_bitmap); + goto out; + } + vfree(dirty_bitmap); + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; } r = 0; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - r = -EFAULT; -out_free: - vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; -- cgit v1.2.3 From 08acfa187117046f8b5044b4a4cdc910f3ceeeb5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 13:00:55 +0300 Subject: KVM: kvm_pdptr_read() may sleep Annotate it thusly. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf532..d2a98f8f9af 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { + might_sleep(); /* on svm */ + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); -- cgit v1.2.3 From 1c11e713576edf33b95669be9c2dc0ff1e0c90d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:05:44 +0300 Subject: KVM: VMX: Avoid writing HOST_CR0 every entry cr0.ts may change between entries, so we copy cr0 to HOST_CR0 before each entry. That is slow, so instead, set HOST_CR0 to have TS set unconditionally (which is a safe value), and issue a clts() just before exiting vcpu context if the task indeed owns the fpu. Saves ~50 cycles/exit. 
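The resulting shape of the fast path, as a sketch (TS_USEDFPU and clts() are the interfaces this kernel generation provides; the helper itself is illustrative, the real work happens inline in __vmx_load_host_state() in the patch below):

	/*
	 * Keep HOST_CR0.TS permanently set in the VMCS and fix up the
	 * live CR0 once, on the way out of vcpu context, instead of
	 * rewriting HOST_CR0 on every guest entry.
	 */
	static inline void fixup_host_cr0_ts(void)
	{
		/* If this task owns the FPU, the host expects TS clear. */
		if (current_thread_info()->status & TS_USEDFPU)
			clts();
		/* Otherwise TS=1 is what the host wants: the next FPU use
		 * traps and restores state lazily, as usual. */
	}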
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++------ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64252075796..59893173425 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -812,6 +812,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); } #endif + if (current_thread_info()->status & TS_USEDFPU) + clts(); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2507,7 +2509,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ @@ -3859,11 +3861,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b270fd6063..801afc6461e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1731,8 +1731,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_put_guest_fpu(vcpu); kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); } static int is_efer_nx(void) -- cgit v1.2.3 From 9de41573675cbace09b02ef386f3e9c8739d495c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:22 +0300 Subject: KVM: x86 emulator: introduce read cache Introduce a read cache, which is needed for instructions that require more than one exit to userspace. After returning from userspace the instruction will be re-executed with the cached read value. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0b2729bf207..288cbedcab1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -186,6 +186,7 @@ struct decode_cache { unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; + struct read_cache mem_read; }; struct x86_emulate_ctxt { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed..776874b8e50 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1263,6 +1263,33 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ?
-1 : 0; } +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) +{ + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; + + rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; + + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; + } + return X86EMUL_CONTINUE; +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -1504,9 +1531,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -2475,6 +2502,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int saved_dst_type = c->dst.type; ctxt->interruptibility = 0; + ctxt->decode.mem_read.pos = 0; /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't @@ -2529,20 +2557,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + &c->src.val, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } if (c->src2.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src2.ptr, - &c->src2.val, - c->src2.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2553,8 +2577,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, - c->dst.bytes, ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2981,7 +3005,11 @@ writeback: (rc->end != 0 && rc->end == rc->pos)) ctxt->restart = false; } - + /* + * reset read cache here in case string instruction is restared + * without decoding + */ + ctxt->decode.mem_read.end = 0; /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(ctxt->vcpu, c->eip); -- cgit v1.2.3 From 054fe9f6e3b76877516b37ac7d83d58c7f37c1b6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:23 +0300 Subject: KVM: x86 emulator: fix Move r/m16 to segment register decoding This instruction does not need generic decoding for its dst operand. 
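To see why, contrast what generic DstReg decoding would do with what the opcode handler actually needs (illustrative, not literal kernel code):

	/*
	 * DstReg decoding would bind the ModRM reg field to a general
	 * purpose register and write it back afterwards:
	 *
	 *	c->dst.ptr = &c->regs[c->modrm_reg];	/* wrong table */
	 *
	 * but for mov sreg, r/m16 that field names a segment register,
	 * which the handler loads itself, roughly:
	 *
	 *	load_segment_descriptor(ctxt, ops, c->src.val, c->modrm_reg);
	 *
	 * so the destination must stay out of the generic decode and
	 * writeback paths -- hence ImplicitOps.
	 */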
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 776874b8e50..a81e6bfcade 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -171,7 +171,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, + ImplicitOps | SrcMem | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit v1.2.3 From f0c13ef1a8f31be08bf1b1244fe4565f11f4b009 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:24 +0300 Subject: KVM: x86 emulator: cleanup xchg emulation Dst operand is already initialized during decoding stage. No need to reinitialize. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a81e6bfcade..a99d49cc893 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2804,8 +2804,8 @@ special_insn: break; } case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.type = OP_REG; + c->src.bytes = c->op_bytes; c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; c->src.val = *(c->src.ptr); goto xchg; -- cgit v1.2.3 From b8a98945ea5b735e083eaf92906aa0ff9ece92e8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:25 +0300 Subject: KVM: x86 emulator: cleanup nop emulation Make it more explicit what we are checking for. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a99d49cc893..03a72912d7b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2799,8 +2799,8 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; + if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 0x97: /* xchg reg,rax */ -- cgit v1.2.3 From 414e6277fd148f6470261cef50a7fed0d88a2825 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:26 +0300 Subject: KVM: x86 emulator: handle "far address" source operand ljmp/lcall instruction operand contains address and segment. It can be 10 bytes long. Currently we decode it as two different operands. Fix it by introducing new kind of operand that can hold entire far address. 
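The in-memory layout the new operand kind has to carry, for the 32-bit operand-size case (illustrative struct; the emulator reads the bytes with memcpy() rather than through a type):

	/* a ptr16:32 far pointer as used by ljmp/lcall: the offset comes
	 * first, the 16-bit selector follows, which is why the operand
	 * needs op_bytes + 2 bytes of storage */
	struct far_ptr32 {
		unsigned int   offset;		/* new eip, op_bytes wide */
		unsigned short selector;	/* new cs */
	} __attribute__((packed));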
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 +++- arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 288cbedcab1..69a64a6a36f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -143,7 +143,11 @@ struct x86_emulate_ops { struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - unsigned long val, orig_val, *ptr; + unsigned long orig_val, *ptr; + union { + unsigned long val; + char valptr[sizeof(unsigned long) + 2]; + }; }; struct fetch_cache { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 03a72912d7b..687ea0906b7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -67,6 +67,8 @@ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -88,10 +90,6 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be - in memory and second argument is located - immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { @@ -175,7 +173,7 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16 | No64, 0, + 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -215,7 +213,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, + SrcImmFAddr | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ @@ -350,7 +348,7 @@ static u32 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, @@ -576,6 +574,13 @@ static u32 group2_table[] = { (_type)_x; \ }) +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -1160,6 +1165,17 @@ done_prefixes: c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; } /* @@ -1179,22 +1195,10 @@ done_prefixes: c->src2.bytes = 1; c->src2.val 
= insn_fetch(u8, 1, c->eip); break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; break; - case Src2Mem16: - c->src2.type = OP_MEM; - c->src2.bytes = 2; - c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); - c->src2.val = 0; - break; } /* Decode and fetch the destination operand: register or memory. */ @@ -2558,7 +2562,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->src.type == OP_MEM) { rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, - &c->src.val, c->src.bytes); + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; @@ -2884,14 +2888,18 @@ special_insn: } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ + case 0xea: { /* jmp far */ + unsigned short sel; jump_far: - if (load_segment_descriptor(ctxt, ops, c->src2.val, - VCPU_SREG_CS)) + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) goto done; - c->eip = c->src.val; + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); break; + } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); -- cgit v1.2.3 From 35aa5375d407ecadcc3adb5cb31d27044bf7f29f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:27 +0300 Subject: KVM: x86 emulator: add (set|get)_dr callbacks to x86_emulate_ops Add (set|get)_dr callbacks to x86_emulate_ops instead of calling them directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/emulate.c | 7 +++++-- arch/x86/kvm/x86.c | 12 ++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 69a64a6a36f..c37296d0e90 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -137,6 +137,8 @@ struct x86_emulate_ops { void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); + int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); }; /* Type, address-of, and value of an instruction's operand. 
*/ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76f5483cffe..97774ae3c87 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -591,10 +591,6 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 687ea0906b7..8a4aa73ff1e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3132,7 +3132,7 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ @@ -3145,7 +3145,10 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); + + ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), + ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 801afc6461e..059d63de169 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3620,16 +3620,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(ctxt->vcpu, dr, dest); + return kvm_get_dr(vcpu, dr, dest); } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - - return kvm_set_dr(ctxt->vcpu, dr, value & mask); + return kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3811,6 +3809,8 @@ static struct x86_emulate_ops emulate_ops = { .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .set_rflags = emulator_set_rflags, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3fb1b5dbd397d16a855c97c3fb80fe6e9196ce7c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:28 +0300 Subject: KVM: x86 emulator: add (set|get)_msr callbacks to x86_emulate_ops Add (set|get)_msr callbacks to x86_emulate_ops instead of calling them directly. 
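One thing the indirection buys, sketched with an invented stub (fake_get_msr() is hypothetical, purely for illustration): the emulator core can now be driven by any x86_emulate_ops implementation, with no hard dependency on kvm_x86_ops.

	/* hypothetical test stub: every MSR reads as zero */
	static int fake_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
	{
		*pdata = 0;
		return 0;
	}

	/* wired up the same way as the real implementation:
	 *	static struct x86_emulate_ops test_ops = {
	 *		.get_msr = fake_get_msr,
	 *		...
	 *	};
	 */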
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 36 ++++++++++++++++++------------------ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c37296d0e90..f751657be73 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,8 @@ struct x86_emulate_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); }; /* Type, address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a4aa73ff1e..7c8ed560fd4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1875,7 +1875,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, } static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1890,7 +1890,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; cs.selector = (u16)(msr_data & 0xfffc); ss.selector = (u16)(msr_data + 8); @@ -1907,17 +1907,17 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); + ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? 
+ MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1927,7 +1927,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) } static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1949,7 +1949,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { @@ -1979,17 +1979,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -2012,7 +2012,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.dpl = 3; ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs.selector = (u16)(msr_data + 16); @@ -3099,7 +3099,7 @@ twobyte_insn: } break; case 0x05: /* syscall */ - rc = emulate_syscall(ctxt); + rc = emulate_syscall(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else @@ -3155,7 +3155,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } @@ -3164,7 +3164,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } else { @@ -3175,14 +3175,14 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt); + rc = emulate_sysenter(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else goto writeback; break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt); + rc = emulate_sysexit(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 059d63de169..e3a5455049b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3811,6 +3811,8 @@ static struct x86_emulate_ops emulate_ops = { .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, }; static void 
cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 5951c4423724759906b10a26aa6a8817c4afa615 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:29 +0300 Subject: KVM: x86 emulator: add get_cached_segment_base() callback to x86_emulate_ops On VMX it is expensive to call get_cached_descriptor() just to get the segment base, since multiple vmcs_reads are done instead of only one. Introduce a new callback, get_cached_segment_base(), for efficiency. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 13 +------------ arch/x86/kvm/x86.c | 7 +++++++ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index f751657be73..df53ba2294b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -132,6 +132,7 @@ struct x86_emulate_ops { int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); + unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7c8ed560fd4..8228778ace3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2097,17 +2097,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } -static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - int seg) -{ - struct desc_struct desc; - if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) - return get_desc_base(&desc); - else - return ~0; -} - static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct tss_segment_16 *tss) @@ -2383,7 +2372,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, int ret; u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); ulong old_tss_base = - get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); u32 desc_limit; /* FIXME: old_tss_base == ~0 ?
*/ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3a5455049b..9a469df6011 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3714,6 +3714,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, struct kvm_vcpu *vcpu) { @@ -3804,6 +3810,7 @@ static struct x86_emulate_ops emulate_ops = { .set_cached_descriptor = emulator_set_cached_descriptor, .get_segment_selector = emulator_get_segment_selector, .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, -- cgit v1.2.3 From 79168fd1a307ffee46ee03b7f8711559241738c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:30 +0300 Subject: KVM: x86 emulator: cleanup some direct calls into kvm to use existing callbacks Use callbacks from x86_emulate_ops to access segments instead of calling into kvm directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 200 ++++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8228778ace3..f56ec486393 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -622,31 +622,35 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); + return ops->get_cached_segment_base(seg, ctxt->vcpu); } static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) return 0; - return seg_base(ctxt, c->seg_override); + return seg_base(ctxt, ops, c->seg_override); } -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +static unsigned long es_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_ES); + return seg_base(ctxt, ops, VCPU_SREG_ES); } -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_SS); + return seg_base(ctxt, ops, VCPU_SREG_SS); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -941,7 +945,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { @@ -1065,7 +1069,7 @@ done_prefixes: set_seg_override(c, VCPU_SREG_DS); if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); + c->modrm_ea += seg_override_base(ctxt, ops, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1161,7 +1165,7 @@ done_prefixes: c->src.type = OP_MEM; c->src.bytes = 
(c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, c), + register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; @@ -1257,7 +1261,7 @@ done_prefixes: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt), + register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; @@ -1516,7 +1520,8 @@ exception: return X86EMUL_PROPAGATE_FAULT; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; @@ -1524,7 +1529,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]); } @@ -1535,7 +1540,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]), dest, len); if (rc != X86EMUL_CONTINUE) @@ -1588,15 +1593,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment segment; - kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - c->src.val = segment.selector; - emulate_push(ctxt); + emulate_push(ctxt, ops); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1614,7 +1618,8 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +static void emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1624,7 +1629,7 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt) (reg == VCPU_REGS_RSP) ? 
(c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt); + emulate_push(ctxt, ops); ++reg; } } @@ -1726,14 +1731,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } return X86EMUL_CONTINUE; @@ -1847,39 +1852,40 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) + struct x86_emulate_ops *ops, struct desc_struct *cs, + struct desc_struct *ss) { - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); + memset(cs, 0, sizeof(struct desc_struct)); + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ + set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ + set_desc_limit(cs, 0xfffff); /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; + cs->p = 1; + cs->d = 1; - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ + set_desc_base(ss, 0); /* flat segment */ + set_desc_limit(ss, 0xfffff); /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ + ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; - ss->present = 1; + ss->p = 1; } static int emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -1888,19 +1894,21 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); + cs_sel = (u16)(msr_data & 0xfffc); + ss_sel = (u16)(msr_data + 8); if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { @@ -1930,8 +1938,9 @@ static int emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { @@ -1947,7 +1956,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return 
X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { @@ -1966,18 +1975,20 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data; + cs_sel &= ~SELECTOR_RPL_MASK; + ss_sel = cs_sel + 8; + ss_sel &= ~SELECTOR_RPL_MASK; if (ctxt->mode == X86EMUL_MODE_PROT64 || is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; @@ -1992,9 +2003,10 @@ static int emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; int usermode; + u16 cs_sel, ss_sel; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -2003,7 +2015,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -2015,29 +2027,31 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); + cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = (u16)(msr_data + 24); + ss_sel = (u16)(msr_data + 24); break; case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); + cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = cs.selector + 8; - cs.db = 0; + ss_sel = cs_sel + 8; + cs.d = 0; cs.l = 1; break; } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; + cs_sel |= SELECTOR_RPL_MASK; + ss_sel |= SELECTOR_RPL_MASK; - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; @@ -2061,25 +2075,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - struct kvm_segment tr_seg; + struct desc_struct tr_seg; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); - if (tr_seg.unusable) + ops->get_cached_descriptor(&tr_seg, 
VCPU_SREG_TR, ctxt->vcpu); + if (!tr_seg.p) return false; - if (tr_seg.limit < 103) + if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, - NULL); + r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, + ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; - if (io_bitmap_ptr + port/8 > tr_seg.limit) + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, - ctxt->vcpu, NULL); + r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, + &perm, 1, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2445,7 +2459,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt); + emulate_push(ctxt, ops); } return ret; @@ -2588,7 +2602,7 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, VCPU_SREG_ES); + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -2600,14 +2614,14 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, VCPU_SREG_CS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, VCPU_SREG_SS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -2619,7 +2633,7 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, VCPU_SREG_DS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -2649,7 +2663,7 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: @@ -2658,7 +2672,7 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt); + emulate_pusha(ctxt, ops); break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); @@ -2672,7 +2686,7 @@ special_insn: break; case 0x68: /* push imm */ case 0x6a: /* push imm8 */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ @@ -2752,18 +2766,13 @@ special_insn: break; case 0x88 ... 
0x8b: /* mov */ goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= VCPU_SREG_GS) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { + case 0x8c: /* mov r/m, sreg */ + if (c->modrm_reg > VCPU_SREG_GS) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - c->dst.val = segreg.selector; + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; - } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; @@ -2804,7 +2813,7 @@ special_insn: goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -2872,7 +2881,7 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 0xe9: /* jmp rel */ @@ -2985,11 +2994,12 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, - &c->src); + string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), + VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) - string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, + &c->dst); if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; @@ -3188,7 +3198,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, VCPU_SREG_FS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3207,7 +3217,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, VCPU_SREG_GS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); -- cgit v1.2.3 From 0f12244fe70e8a94a491f6cd7ed70a352ab6c26c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:31 +0300 Subject: KVM: x86 emulator: make set_cr() callback return error if it fails Make set_cr() callback return error if it fails instead of injecting #GP behind emulator's back. 
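The change follows one pattern throughout, shown here reduced to its shape (crX stands in for cr0/cr3/cr4/cr8):

	/* a __helper that only reports failure ... */
	static int __kvm_set_crX(struct kvm_vcpu *vcpu, unsigned long val);

	/* ... and a public wrapper that preserves the old inject-#GP
	 * behaviour for callers outside the emulator */
	void kvm_set_crX(struct kvm_vcpu *vcpu, unsigned long val)
	{
		if (__kvm_set_crX(vcpu, val))
			kvm_inject_gp(vcpu, 0);
	}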
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 10 ++- arch/x86/kvm/x86.c | 148 +++++++++++++++++++------------------ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df53ba2294b..6c4f4918db5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -135,7 +135,7 @@ struct x86_emulate_ops { unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); + int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f56ec486393..061f7d37c9f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,7 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - ops->set_cr(3, tss->cr3, ctxt->vcpu); + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; c->regs[VCPU_REGS_RAX] = tss->eax; @@ -3135,7 +3138,10 @@ twobyte_insn: c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); + if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a469df6011..64c6e7a3141 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -414,57 +414,49 @@ out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & 0xffffffff00000000UL) + return 1; #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; } kvm_x86_ops->set_cr0(vcpu, cr0); kvm_mmu_reset_context(vcpu); - return; + return 0; +} + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (__kvm_set_cr0(vcpu, cr0)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr0); @@ -474,61 +466,56 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } 
EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - if (cr4 & CR4_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr4 & CR4_RESERVED_BITS) + return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; kvm_mmu_reset_context(vcpu); + + return 0; +} + +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (__kvm_set_cr4(vcpu, cr4)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -546,24 +533,34 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. 
*/ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; +} + +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (__kvm_set_cr3(vcpu, cr3)) kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (__kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -3681,27 +3678,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = __kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - kvm_set_cr8(vcpu, val & 0xfUL); + res = __kvm_set_cr8(vcpu, val & 0xfUL); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; } + + return res; } static int emulator_get_cpl(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 338dbc9781eb5acd0b12809d95d4006135f29767 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:32 +0300 Subject: KVM: x86 emulator: make (get|set)_dr() callback return error if it fails Make (get|set)_dr() callback return error if it fails instead of injecting exception behind emulator's back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++--- arch/x86/kvm/x86.c | 63 +++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 061f7d37c9f..d5979ecc252 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3151,9 +3151,14 @@ twobyte_insn: goto done; } - ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), - ctxt->vcpu); + if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? + ~0ULL : ~0U), ctxt->vcpu) < 0) { + /* #UD condition is already handled by the code above */ + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c6e7a3141..44a546b136f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -573,7 +573,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { case 0 ... 
3: @@ -582,29 +582,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -615,28 +607,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; @@ -645,6 +646,15 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 0; } + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; +} EXPORT_SYMBOL_GPL(kvm_get_dr); static inline u32 bit(int bitno) @@ -3619,12 +3629,13 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(vcpu, dr, dest); } int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - return kvm_set_dr(vcpu, dr, value); + + return __kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -- cgit v1.2.3 From e680080e653b8c8725ca620bf22a5f8480f40cb5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:33 +0300 Subject: KVM: x86 emulator: fix X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED values Currently X86EMUL_PROPAGATE_FAULT, X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED have the same value so caller cannot distinguish why function such as emulator_cmpxchg_emulated() (which can return both X86EMUL_PROPAGATE_FAULT and X86EMUL_CMPXCHG_FAILED) failed. 
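To make the clash concrete, here is a minimal hedged sketch of a caller; the surrounding names are invented for illustration, and the pre-patch values are the ones visible in the hunk below:

        /* pre-patch: both macros expanded to 2, aliasing X86EMUL_PROPAGATE_FAULT */
        rc = emulator_cmpxchg_emulated(addr, &old, &new, bytes, vcpu);
        if (rc == X86EMUL_CMPXCHG_FAILED)       /* also true for PROPAGATE_FAULT! */
                goto retry;                     /* may wrongly retry after a fault */
        if (rc == X86EMUL_PROPAGATE_FAULT)      /* dead code: matched above */
                goto inject;

With distinct values (3 and 4), the two branches become reachable and actually mean what they say.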
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6c4f4918db5..0cf4311db0d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -51,8 +51,9 @@ struct x86_emulate_ctxt; #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ + struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. -- cgit v1.2.3 From 411c35b7ef02aefb91e166ffeffad0891d955fcb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:34 +0300 Subject: KVM: fill in run->mmio details in (read|write)_emulated function Fill in run->mmio details in (read|write)_emulated function just like pio does. There is no point in filling only vcpu fields there just to copy them into vcpu->run a little bit later. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 44a546b136f..b976c4c1fa8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3386,9 +3386,10 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; return X86EMUL_UNHANDLEABLE; } @@ -3436,10 +3437,11 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); return X86EMUL_CONTINUE; } @@ -3850,7 +3852,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, { int r, shadow_mask; struct decode_cache *c; - struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3937,14 +3938,6 @@ restart: return EMULATE_DO_MMIO; } - if (r || vcpu->mmio_is_write) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - if (r) { if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) goto done; -- cgit v1.2.3 From c3cd7ffaf57ae6ead5b394cebaeb76164059a57f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:35 +0300 Subject: KVM: x86 emulator: x86_emulate_insn() return -1 only in case of emulation failure Currently emulator returns -1 when emulation failed or IO is needed. 
The caller tries to guess whether emulation failed by looking at other variables. Make it easier for the caller to recognise the error condition by always returning -1 in case of failure. For this, a new emulator-internal return value X86EMUL_IO_NEEDED is introduced. It is used to distinguish between an error condition (which returns X86EMUL_UNHANDLEABLE) and a condition that requires an IO exit to userspace to continue emulation. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 36 ++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0cf4311db0d..777240d4524 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -53,6 +53,7 @@ struct x86_emulate_ctxt; #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ +#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ struct x86_emulate_ops { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b976c4c1fa8..4cb65d82abc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3275,7 +3275,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3331,7 +3331,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3391,7 +3391,7 @@ mmio: vcpu->run->mmio.len = vcpu->mmio_size = bytes; vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - return X86EMUL_UNHANDLEABLE; + return X86EMUL_IO_NEEDED; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3863,8 +3863,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); - vcpu->mmio_is_write = 0; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -3938,24 +3936,26 @@ restart: return EMULATE_DO_MMIO; } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - goto done; - if (!vcpu->mmio_needed) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; + if (r) { /* emulation failed */ + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-enter the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + + trace_kvm_emulate_insn_failed(vcpu); + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; } -done: if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit v1.2.3 From f181b96d4c769b8915849eb9070c18116fd8d44e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:36 +0300 Subject: KVM: remove export of emulator_write_emulated() It is not called directly outside of the file it's defined in anymore.
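Stepping back to the X86EMUL_IO_NEEDED split introduced just above, the caller-side dispatch it enables can be sketched roughly as follows (illustrative only; in the real code the IO case is surfaced through vcpu->mmio_needed and vcpu->arch.pio rather than the raw return value):

        switch (rc) {
        case X86EMUL_CONTINUE:
                break;                          /* emulation completed */
        case X86EMUL_IO_NEEDED:
                return EMULATE_DO_MMIO;         /* exit to userspace, resume later */
        default:                                /* X86EMUL_UNHANDLEABLE */
                return EMULATE_FAIL;            /* genuine emulation failure */
        }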
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 ----- arch/x86/kvm/x86.c | 1 - 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97774ae3c87..2ca1867ed97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -628,11 +628,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cb65d82abc..15a4b754a45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3465,7 +3465,6 @@ int emulator_write_emulated(unsigned long addr, } return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); #define CMPXCHG_TYPE(t, ptr, old, new) \ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) -- cgit v1.2.3 From 8fe681e984b6505d4d12125c0776399304803ec7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:37 +0300 Subject: KVM: do not inject #PF in (read|write)_emulated() callbacks Return an error to the x86 emulator instead of injecting an exception behind its back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 +++ arch/x86/kvm/emulate.c | 12 +++++++++++- arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 777240d4524..b7e00cb21c6 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -94,6 +94,7 @@ struct x86_emulate_ops { int (*read_emulated)(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -106,6 +107,7 @@ struct x86_emulate_ops { int (*write_emulated)(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -120,6 +122,7 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); int (*pio_in_emulated)(int size, unsigned short port, void *val, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d5979ecc252..d7a18a0f80a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1277,6 +1277,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, { int rc; struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; while (size) { int n = min(size, 8u); @@ -1284,7 +1285,10 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1789,6 +1793,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, { int rc; struct decode_cache *c = &ctxt->decode; + u32 err; switch (c->dst.type) { case OP_REG: @@ -1817,13 +1822,18 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.orig_val, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); else rc =
ops->write_emulated( (unsigned long)c->dst.ptr, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, + (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15a4b754a45..51402d8a46f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3346,10 +3346,10 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3359,12 +3359,10 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3409,17 +3407,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3449,6 +3445,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ @@ -3456,14 +3453,16 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, error_code, + vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, error_code, + vcpu); } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -3480,6 +3479,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3533,7 +3533,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, vcpu); + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -4293,7 +4293,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, vcpu); + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -- cgit v1.2.3 From 3457e4192e367fd4e0da5e9f46f9df85fa99cd11 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:38 +0300 Subject: KVM: handle emulation failure case first If emulation failed return immediately. 
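The reshuffle applies the usual guard-clause pattern: handle the failure case first and return, so the success path below it stays flat. Schematically (illustrative C, not the kernel code):

        r = do_work();
        if (r) {                        /* failure handled first */
                try_to_recover();
                return EMULATE_FAIL;
        }
        /* success path: no failure logic interleaved below this point */
        commit_results();
        return EMULATE_DONE;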
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51402d8a46f..9e5a833f339 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3924,22 +3924,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - - if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } if (r) { /* emulation failed */ /* @@ -3955,6 +3939,21 @@ restart: return EMULATE_FAIL; } + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; + return EMULATE_DO_MMIO; + } + + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit v1.2.3 From 95c5588652f7742a21c33d9dcce0e043e057d04f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:39 +0300 Subject: KVM: x86 emulator: advance RIP outside x86 emulator code Return new RIP as part of instruction emulation result instead of updating KVM's RIP from x86 emulator code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++++--- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7a18a0f80a..437f31bcffe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2496,8 +2496,9 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, if (rc == X86EMUL_CONTINUE) { memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); rc = writeback(ctxt, ops); + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; } return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -2554,7 +2555,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { string_done: ctxt->restart = false; - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; goto done; } /* The second termination condition only applies for REPE @@ -3032,7 +3033,7 @@ writeback: ctxt->decode.mem_read.end = 0; /* Commit shadow register state. 
*/ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e5a833f339..8f45cc712dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3941,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) @@ -4945,6 +4946,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; } -- cgit v1.2.3 From ef050dc0390176ec6888f373edb776587c88be3d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:40 +0300 Subject: KVM: x86 emulator: set RFLAGS outside x86 emulator code Removes the need for set_flags() callback. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 1 - arch/x86/kvm/x86.c | 7 +------ 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7e00cb21c6..a87d95f0957 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,7 +142,6 @@ struct x86_emulate_ops { ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 437f31bcffe..291e220c69a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3034,7 +3034,6 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; - ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f45cc712dd..04ca343ee51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3806,11 +3806,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - kvm_x86_ops->set_rflags(vcpu, rflags); -} - static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -3829,7 +3824,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, - .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, .set_msr = kvm_set_msr, @@ -3941,6 +3935,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { -- cgit v1.2.3 From bdb475a323858101f4a5ad6a1a04b1dd8885325a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:41 +0300 Subject: KVM: x86 emulator: use shadowed register in emulate_sysexit() emulate_sysexit() should use shadowed registers copy instead of looking into vcpu state directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 291e220c69a..42cb7d71ff5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2063,8 +2063,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + c->eip = c->regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 4d2179e1e9cb74b25a8181a506600d96e15504fb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:42 +0300 Subject: KVM: x86 emulator: handle shadowed registers outside emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 --------------- arch/x86/kvm/x86.c | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 42cb7d71ff5..97a42e8c00d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -941,12 +941,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); - /* Shadow copy of register state. Committed on successful emulation. 
*/ - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { case X86EMUL_MODE_REAL: @@ -2486,16 +2483,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); c->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) { rc = writeback(ctxt, ops); if (rc == X86EMUL_CONTINUE) ctxt->eip = c->eip; @@ -2525,13 +2519,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; @@ -3031,8 +3018,6 @@ writeback: * without decoding */ ctxt->decode.mem_read.end = 0; - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04ca343ee51..21d36081a9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3844,7 +3844,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { int r, shadow_mask; - struct decode_cache *c; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3869,13 +3869,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ - c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -3916,6 +3917,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + /* this is needed for vmware backdoor interface to work since it + changes register values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); @@ -3936,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { @@ -4919,6 +4925,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l, ret; cache_all_regs(vcpu); @@ -4933,6 +4940,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, ?
X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, tss_selector, reason, has_error_code, @@ -4941,6 +4950,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; -- cgit v1.2.3 From 95cb229530f329ec8002274891793be9c91385f7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:43 +0300 Subject: KVM: x86 emulator: move interruptibility state tracking out of emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++----------------- arch/x86/kvm/x86.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97a42e8c00d..c40b40561df 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1843,20 +1843,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2516,7 +2502,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; - ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { @@ -2789,7 +2774,7 @@ special_insn: } if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); @@ -2958,7 +2943,7 @@ special_insn: if (emulator_bad_iopl(ctxt, ops)) kvm_inject_gp(ctxt->vcpu, 0); else { - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21d36081a9d..91bfe7771f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3838,12 +3838,26 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. 
The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type) { - int r, shadow_mask; + int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); @@ -3871,6 +3885,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_ctxt.interruptibility = 0; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3938,8 +3953,7 @@ restart: return EMULATE_FAIL; } - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); -- cgit v1.2.3 From 54b8486f469475d6c8e8aec917b91239a54eb8c8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:44 +0300 Subject: KVM: x86 emulator: do not inject exception directly into vcpu Return exception as a result of instruction emulation and handle injection in KVM code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 ++ arch/x86/kvm/emulate.c | 124 +++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 20 +++++- 3 files changed, 100 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a87d95f0957..51cfd730ac5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -216,6 +216,12 @@ struct x86_emulate_ctxt { int interruptibility; bool restart; /* restart string instruction after writeback */ + + int exception; /* exception that happens during emulation or -1 */ + u32 error_code; /* error code for exception */ + bool error_code_valid; + unsigned long cr2; /* faulted address in case of #PF */ + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c40b40561df..b43ac98ef79 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -653,6 +653,37 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, return seg_base(ctxt, ops, VCPU_SREG_SS); } +static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, + u32 error, bool valid) +{ + ctxt->exception = vec; + ctxt->error_code = error; + ctxt->error_code_valid = valid; + ctxt->restart = false; +} + +static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, GP_VECTOR, err, true); +} + +static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, + int err) +{ + ctxt->cr2 = addr; + emulate_exception(ctxt, PF_VECTOR, err, true); +} + +static void emulate_ud(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, TS_VECTOR, err, true); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -1285,7 +1316,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc 
== X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1366,13 +1397,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1391,14 +1422,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1517,7 +1548,7 @@ load: ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: - kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + emulate_exception(ctxt, err_vec, err_code, true); return X86EMUL_PROPAGATE_FAULT; } @@ -1578,7 +1609,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; case X86EMUL_MODE_VM86: if (iopl < 3) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } change_mask |= EFLG_IF; @@ -1829,7 +1860,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, + emulate_pf(ctxt, (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; @@ -1883,7 +1914,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1937,7 +1968,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -1945,7 +1976,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) * Therefore, we inject an #UD. 
*/ if (ctxt->mode == X86EMUL_MODE_PROT64) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1955,13 +1986,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; @@ -2004,7 +2035,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -2022,7 +2053,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = (u16)(msr_data + 24); @@ -2030,7 +2061,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = cs_sel + 8; @@ -2192,7 +2223,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2202,7 +2233,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2210,7 +2241,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2223,7 +2254,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2266,7 +2297,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, int ret; if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } c->eip = tss->eip; @@ -2334,7 +2365,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2344,7 +2375,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2352,7 +2383,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == 
X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2365,7 +2396,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2399,7 +2430,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } } @@ -2408,8 +2439,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, - tss_selector & 0xfffc); + emulate_ts(ctxt, tss_selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } @@ -2505,19 +2535,19 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* LOCK prefix is allowed only with some instructions */ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -2679,7 +2709,7 @@ special_insn: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, @@ -2691,7 +2721,7 @@ special_insn: c->src.bytes = min(c->src.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->src.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], @@ -2754,7 +2784,7 @@ special_insn: goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); @@ -2769,7 +2799,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -2895,7 +2925,7 @@ special_insn: do_io_in: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, @@ -2908,7 +2938,7 @@ special_insn: do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, @@ -2933,7 +2963,7 @@ special_insn: break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -2941,7 +2971,7 @@ special_insn: break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; @@ -3069,7 +3099,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 5: /* not defined */ - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, c->modrm_ea); @@ -3102,7 +3132,7 @@ twobyte_insn: case 1: case 5 ... 7: case 9 ... 15: - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); @@ -3111,7 +3141,7 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); @@ -3119,7 +3149,7 @@ twobyte_insn: break; case 0x22: /* mov reg, cr */ if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } c->dst.type = OP_NONE; @@ -3127,7 +3157,7 @@ twobyte_insn: case 0x23: /* mov from reg to dr */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -3135,7 +3165,7 @@ twobyte_insn: ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -3146,7 +3176,7 @@ twobyte_insn: msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } rc = X86EMUL_CONTINUE; @@ -3155,7 +3185,7 @@ twobyte_insn: case 0x32: /* rdmsr */ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91bfe7771f5..63c87adcec4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3852,6 +3852,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) kvm_x86_ops->set_interrupt_shadow(vcpu, mask); } +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception == PF_VECTOR) + kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + else if (ctxt->error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + else + kvm_queue_exception(vcpu, ctxt->exception); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3886,6 +3897,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.exception = -1; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3958,6 +3970,11 @@ restart: memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (vcpu->arch.emulate_ctxt.exception >= 0) { + inject_emulated_exception(vcpu); + return EMULATE_DONE; + } + if 
(vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; @@ -3970,9 +3987,6 @@ restart: return EMULATE_DO_MMIO; } - if (vcpu->arch.exception.pending) - vcpu->arch.emulate_ctxt.restart = false; - if (vcpu->arch.emulate_ctxt.restart) goto restart; -- cgit v1.2.3 From d94e1dc9af60e3431a586c3edfbe42d8a0d3932b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:54:48 +0300 Subject: KVM: Get rid of KVM_REQ_KICK KVM_REQ_KICK poisons vcpu->requests by having a bit set during normal operation. This causes the fast path check for a clear vcpu->requests to fail all the time, triggering tons of atomic operations. Fix by replacing KVM_REQ_KICK with a vcpu->guest_mode atomic. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63c87adcec4..fc5611b4007 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4604,13 +4604,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - local_irq_disable(); + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); - clear_bit(KVM_REQ_KICK, &vcpu->requests); - smp_mb__after_clear_bit(); + local_irq_disable(); - if (vcpu->requests || need_resched() || signal_pending(current)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); preempt_enable(); r = 1; @@ -4655,7 +4657,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - set_bit(KVM_REQ_KICK, &vcpu->requests); + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); ++vcpu->stat.exits; @@ -5580,7 +5583,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + if (atomic_xchg(&vcpu->guest_mode, 0)) smp_send_reschedule(cpu); put_cpu(); } -- cgit v1.2.3 From 3f10c846f8f9e6a32bbfeefaf7dda7ff51c7da29 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:42 +0200 Subject: KVM: SVM: Dump vmcb contents on failed vmrun This patch adds a function to dump the vmcb into the kernel log and calls it after a failed vmrun to ease debugging. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd2..685cffff01f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2726,6 +2726,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; +void dump_vmcb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + pr_err("VMCB Control Area:\n"); + pr_err("cr_read: %04x\n", control->intercept_cr_read); + pr_err("cr_write: %04x\n", control->intercept_cr_write); + pr_err("dr_read: %04x\n", control->intercept_dr_read); + pr_err("dr_write: %04x\n", control->intercept_dr_write); + pr_err("exceptions: %08x\n", control->intercept_exceptions); + pr_err("intercepts: %016llx\n", control->intercept); + pr_err("pause filter count: %d\n", control->pause_filter_count); + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); + pr_err("tsc_offset: %016llx\n", control->tsc_offset); + pr_err("asid: %d\n", control->asid); + pr_err("tlb_ctl: %d\n", control->tlb_ctl); + pr_err("int_ctl: %08x\n", control->int_ctl); + pr_err("int_vector: %08x\n", control->int_vector); + pr_err("int_state: %08x\n", control->int_state); + pr_err("exit_code: %08x\n", control->exit_code); + pr_err("exit_info1: %016llx\n", control->exit_info_1); + pr_err("exit_info2: %016llx\n", control->exit_info_2); + pr_err("exit_int_info: %08x\n", control->exit_int_info); + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); + pr_err("nested_ctl: %lld\n", control->nested_ctl); + pr_err("nested_cr3: %016llx\n", control->nested_cr3); + pr_err("event_inj: %08x\n", control->event_inj); + pr_err("event_inj_err: %08x\n", control->event_inj_err); + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); + pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("VMCB State Save Area:\n"); + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); + pr_err("cpl: %d efer: %016llx\n", + save->cpl, save->efer); + pr_err("cr0: %016llx cr2: %016llx\n", + save->cr0, 
save->cr2); + pr_err("cr3: %016llx cr4: %016llx\n", + save->cr3, save->cr4); + pr_err("dr6: %016llx dr7: %016llx\n", + save->dr6, save->dr7); + pr_err("rip: %016llx rflags: %016llx\n", + save->rip, save->rflags); + pr_err("rsp: %016llx rax: %016llx\n", + save->rsp, save->rax); + pr_err("star: %016llx lstar: %016llx\n", + save->star, save->lstar); + pr_err("cstar: %016llx sfmask: %016llx\n", + save->cstar, save->sfmask); + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", + save->kernel_gs_base, save->sysenter_cs); + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", + save->sysenter_esp, save->sysenter_eip); + pr_err("gpat: %016llx dbgctl: %016llx\n", + save->g_pat, save->dbgctl); + pr_err("br_from: %016llx br_to: %016llx\n", + save->br_from, save->br_to); + pr_err("excp_from: %016llx excp_to: %016llx\n", + save->last_excp_from, save->last_excp_to); + +} + static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2770,6 +2863,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + dump_vmcb(vcpu); return 0; } -- cgit v1.2.3 From eec4b140c924b4c650e9a89e01d223266490e325 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:44 +0200 Subject: KVM: SVM: Allow EFER.LMSLE to be set with nested svm This patch enables setting of efer bit 13 which is allowed in all SVM capable processors. This is necessary for the SLES11 version of Xen 4.0 to boot with nested svm. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kvm/svm.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae431862..509a42187dc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -20,6 +20,7 @@ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ #define EFER_SCE (1<<_EFER_SCE) @@ -27,6 +28,7 @@ #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 685cffff01f..41fe0381a1a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void) if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } for_each_possible_cpu(cpu) { -- cgit v1.2.3 From f3b8c964a9a6cfef6d3ca778648d53947b9fd257 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:09:21 +0800 Subject: KVM: MMU: mark page table dirty when a pte is actually modified Sometimes cmpxchg_gpte doesn't modify the gpte; in that case, don't mark the page table page as dirty.
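The fix is purely an ordering change: mark the page dirty only after cmpxchg_gpte has actually updated the gpte. In outline (condensed from the hunk below):

        if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
                                pte | PT_ACCESSED_MASK))
                goto walk;                          /* lost the race: retry the walk */
        mark_page_dirty(vcpu->kvm, table_gfn);      /* the write really happened */
        pte |= PT_ACCESSED_MASK;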
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c7f27779c99..5c8ac060442 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -177,10 +177,10 @@ walk: if (!(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -217,11 +217,11 @@ walk: bool ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; } -- cgit v1.2.3 From 518c5a05e89a79e498c95c3e29f29bd236b3c972 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:58:33 +0800 Subject: KVM: MMU: Fix debug output error in walk_addr() Fix a debug output error in walk_addr(): the pte_access and pt_access values were printed in swapped order. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5c8ac060442..15e379eaf3b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -229,7 +229,7 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pt_access, pte_access); + __func__, (u64)pte, pte_access, pt_access); return 1; not_present: -- cgit v1.2.3 From 54a4f0239f2e98bc0842818f611a4cf73bb7dd35 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:03:49 +0800 Subject: KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed Currently, kvm_mmu_zap_page() returns only the number of freed child sps. This can confuse the caller, because the caller does not learn the actual number of pages freed. Let's make kvm_mmu_zap_page() return the number of pages it actually freed.
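With the page itself included in the count, the caller-side accounting becomes a plain subtraction. A simplified sketch of the kvm_mmu_change_mmu_pages() loop touched below (pick_oldest_sp() is a hypothetical stand-in for the container_of() lookup on active_mmu_pages):

while (used_pages > kvm_nr_mmu_pages) {
        struct kvm_mmu_page *sp = pick_oldest_sp(kvm);  /* hypothetical helper */

        /* the return value now covers zapped children plus sp itself */
        used_pages -= kvm_mmu_zap_page(kvm, sp);
}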
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b666d8d106a..be981b1f188 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { + /* Count self */ + ret++; hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { @@ -1540,7 +1542,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); used_pages -= kvm_mmu_zap_page(kvm, page); - used_pages--; } kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; @@ -2941,7 +2942,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page) + 1; + return kvm_mmu_zap_page(kvm, page); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) -- cgit v1.2.3 From 6d77dbfc88e37c9efd5c5dd18445cfe819ae17ea Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 10 May 2010 11:16:56 +0300 Subject: KVM: inject #UD if instruction emulation fails and exit to userspace Do not kill VM when instruction emulation fails. Inject #UD and report failure to userspace instead. Userspace may choose to reenter guest if vcpu is in userspace (cpl == 3) in which case guest OS will kill offending process and continue running. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 5 +---- arch/x86/kvm/svm.c | 10 +++------- arch/x86/kvm/vmx.c | 28 ++++----------------------- arch/x86/kvm/x86.c | 43 +++++++++++++++++------------------------ 5 files changed, 26 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ca1867ed97..0c06148fa3b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,7 +576,6 @@ enum emulation_result { #define EMULTYPE_SKIP (1 << 2) int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be981b1f188..4a02dee1f2b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2814,11 +2814,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - return 0; + /* fall through */ case EMULATE_FAIL: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; return 0; default: BUG(); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41fe0381a1a..134260c36ce 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1535,7 +1535,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return 
emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -2386,16 +2386,12 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int cr8_write_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 59893173425..a82cfa1e2a4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3070,7 +3070,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -3327,22 +3327,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOEXEC; - } - return 1; + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; } static int handle_task_switch(struct kvm_vcpu *vcpu) @@ -3554,13 +3539,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - ret = 0; - goto out; - } + if (err != EMULATE_DONE) + return 0; if (signal_pending(current)) goto out; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc5611b4007..ae9d6f3e5d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3639,24 +3639,6 @@ int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) return __kvm_set_dr(vcpu, dr, value); } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - u8 opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; - - if (!printk_ratelimit()) - return; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - static u64 mk_cr_64(u64 curr_cr, u32 new_val) { return (curr_cr & ~((1ULL << 32) - 1)) | new_val; @@ -3863,6 +3845,19 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception); } +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = 
KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_FAIL; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3931,11 +3926,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } } @@ -3960,9 +3955,7 @@ restart: if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); @@ -4798,7 +4791,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r == EMULATE_DO_MMIO) { + if (r != EMULATE_DONE) { r = 0; goto out; } -- cgit v1.2.3 From f0f5933a1626c8df7b0bfd227819c66320fb4f0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:09:56 +0300 Subject: KVM: MMU: Fix free memory accounting race in mmu_alloc_roots() We drop the mmu lock between freeing memory and allocating the roots; this allows some other vcpu to sneak in and allocate memory. While the race is benign (resulting only in temporary overallocation, not oom) it is simple and easy to fix by moving the freeing close to the allocation. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a02dee1f2b..d7aebafffdf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,6 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2124,6 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2496,9 +2498,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); -- cgit v1.2.3 From 7725b89414836df492d6222b1d3cacb0ca576d77 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:38 +0800 Subject: KVM: VMX: Define new functions to wrapper direct call of asm code Define vmcs_load() and kvm_cpu_vmxon() to avoid direct call of asm code. Also move VMXE bit operation out of kvm_cpu_vmxoff(). 
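The resulting enable/disable ordering can be sketched as follows (illustrative only, condensed from the hunks below; note that the CR4.VMXE writes now live in the callers, bracketing VMXON/VMXOFF):

/* hardware_enable(), simplified: VMXE must be set before VMXON */
write_cr4(read_cr4() | X86_CR4_VMXE);
kvm_cpu_vmxon(phys_addr);

/* hardware_disable(), simplified: VMXOFF first, then clear VMXE */
kvm_cpu_vmxoff();
write_cr4(read_cr4() & ~X86_CR4_VMXE);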
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a82cfa1e2a4..82328882144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -453,6 +453,19 @@ static void vmcs_clear(struct vmcs *vmcs) vmcs, phys_addr); } +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -830,7 +843,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta, new_offset; if (vcpu->cpu != cpu) { @@ -844,15 +856,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); + vmcs_load(vmx->vmcs); } if (vcpu->cpu != cpu) { @@ -1288,6 +1293,13 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); @@ -1310,9 +1322,7 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); + kvm_cpu_vmxon(phys_addr); ept_sync_global(); @@ -1336,13 +1346,13 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { vmclear_local_vcpus(); kvm_cpu_vmxoff(); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, -- cgit v1.2.3 From 92fe13be74303a7b80dc3c99e22e12a87d41bd5f Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:42 +0800 Subject: KVM: VMX: Some minor changes to code structure Do some preparations for vmm coexistence support. 
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82328882144..0d281dbc008 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -845,15 +845,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) { + if (vcpu->cpu != cpu) vcpu_clear(vmx); - kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { per_cpu(current_vmcs, cpu) = vmx->vmcs; @@ -864,6 +857,13 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct desc_ptr dt; unsigned long sysenter_esp; + kvm_migrate_timers(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching -- cgit v1.2.3 From b923e62e4d48bc5242b32a6ef5ba0f886137668a Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:45 +0800 Subject: KVM: VMX: VMCLEAR/VMPTRLD usage changes Originally, VMCLEAR/VMPTRLD were called on vcpu migration. To support hosted VMM coexistence, VMCLEAR is executed on vcpu schedule-out, and VMPTRLD is executed on vcpu schedule-in. This also eliminates the IPI when doing VMCLEAR. Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0d281dbc008..9529bff0426 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -63,6 +63,9 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -845,7 +848,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) + if (vmm_exclusive && vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -891,6 +894,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) + __vcpu_clear(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 4610c9cc6d9c84f7d585583699f04d5f51c83671 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:48 +0800 Subject: KVM: VMX: VMXON/VMXOFF usage changes The SDM suggests that VMXON should be called before VMPTRLD, and VMXOFF should be called after doing VMCLEAR. Therefore, in the vmm coexistence case, we should first call VMXON before any VMCS operation, and then call VMXOFF after the operation is done.
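Taken together with the previous two patches, coexistence mode (vmm_exclusive=0) brackets every vCPU load/put roughly like this (illustrative sketch condensed from the hunks below, error handling omitted):

/* vmx_vcpu_load(): enter VMX operation before touching any VMCS */
kvm_cpu_vmxon(__pa(per_cpu(vmxarea, cpu)));     /* VMXON ... */
vmcs_load(vmx->vmcs);                           /* ... then VMPTRLD */

/* vmx_vcpu_put(): flush VMCS state before leaving VMX operation */
__vcpu_clear(vmx);                              /* VMCLEAR ... */
kvm_cpu_vmxoff();                               /* ... then VMXOFF */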
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9529bff0426..b8aac4e9890 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -176,6 +176,8 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -847,8 +849,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - if (vmm_exclusive && vcpu->cpu != cpu) + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -894,8 +899,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) + if (!vmm_exclusive) { __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1327,9 +1334,11 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } return 0; } @@ -1355,8 +1364,10 @@ static void kvm_cpu_vmxoff(void) static void hardware_disable(void *garbage) { - vmclear_local_vcpus(); - kvm_cpu_vmxoff(); + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -3991,6 +4002,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vmx); } +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4016,7 +4040,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->vmcs) goto free_msrs; - vmcs_clear(vmx->vmcs); + vmcs_init(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); -- cgit v1.2.3 From 24955b6c906045382b67f3e6beba7e5df4a4a045 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 12 May 2010 21:00:35 -0300 Subject: KVM: pass correct parameter to kvm_mmu_free_some_pages Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7aebafffdf..a455c5eee37 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,7 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2125,7 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); - 
kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); -- cgit v1.2.3 From dfb507c41d0d12fc69820abb7f040d31fcf015fe Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Tue, 11 May 2010 22:22:40 +0300 Subject: KVM: x86 emulator: Add test acc, imm instruction (opcodes 0xA8 - 0xA9) This adds test acc, imm instruction to the x86 emulator Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b43ac98ef79..35dd57c5a7f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -181,7 +181,7 @@ static u32 opcode_table[256] = { ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ @@ -2754,6 +2754,7 @@ special_insn: } break; case 0x84 ... 0x85: + test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ @@ -2852,6 +2853,8 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); goto cmp; + case 0xa8 ... 0xa9: /* test ax, imm */ + goto test; case 0xaa ... 0xab: /* stos */ c->dst.val = c->regs[VCPU_REGS_RAX]; break; -- cgit v1.2.3 From abc190830f28a5bb678eaccb633de02ed2967d55 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:21 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for sub instruction This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 35dd57c5a7f..1b974f80e1e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -126,7 +126,7 @@ static u32 opcode_table[256] = { /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From 222b7c52c33bdef721248bfeba992af495800d30 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:22 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for xor instructions This adds missing decoder flags for xor instructions (opcodes 0x34 - 0x35) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1b974f80e1e..7a36eec8bab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ static u32 opcode_table[256] = { /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | 
SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From 62ad07551a2ace89e35604d1c55fdae1dd3359a8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:41 +0800 Subject: KVM: x86: Clean up duplicate assignment mmu.free() already sets root_hpa to INVALID_PAGE, so there is no need to do it again in destroy_kvm_mmu(). kvm_x86_ops->set_cr4() and set_efer() already assign cr4/efer to vcpu->arch.cr4/efer, so there is no need to do it again later. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- arch/x86/kvm/x86.c | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c5eee37..c075542648c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,10 +2478,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae9d6f3e5d0..03039fd8698 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -486,7 +486,7 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; + kvm_mmu_reset_context(vcpu); return 0; @@ -721,8 +721,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.efer = efer; - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); -- cgit v1.2.3 From aad827034e419fa8c5ec39e6455266f0b942d856 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:42 +0800 Subject: KVM: VMX: Only reset MMU when necessary Only modifying certain bits of CR0/CR4 requires a paging mode switch. Modifying the EFER.NXE bit would result in reserved bit updates.
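The idiom applied at all three sites (CR0, CR4, EFER.NX) can be sketched with a hypothetical helper; the bit masks follow the hunks below:

/* Reset the MMU context only when a mode-affecting bit flips. */
static inline bool needs_mmu_reset(unsigned long old_val,
                                   unsigned long new_val,
                                   unsigned long watched_bits)
{
        return (old_val ^ new_val) & watched_bits;      /* XOR isolates changed bits */
}

/* e.g. for CR0: */
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
                            X86_CR0_CD | X86_CR0_NW;
if (needs_mmu_reset(old_cr0, cr0, update_bits))
        kvm_mmu_reset_context(vcpu);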
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03039fd8698..78147f0421a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -416,6 +416,10 @@ out: static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 @@ -449,7 +453,8 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - kvm_mmu_reset_context(vcpu); + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -487,7 +492,8 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_x86_ops->set_cr4(vcpu, cr4); - kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -693,6 +699,8 @@ static u32 emulated_msrs[] = { static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { + u64 old_efer = vcpu->arch.efer; + if (efer & efer_reserved_bits) return 1; @@ -724,6 +732,10 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + return 0; } -- cgit v1.2.3 From e8ad9a707496c163312bcdd6aa3b90603d45dc9b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:06:02 +0800 Subject: KVM: MMU: use proper cache object freeing function Use kmem_cache_free to free objects allocated by kmem_cache_alloc. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c075542648c..bb48b0ca5f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -305,10 +305,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) { while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); + kmem_cache_free(cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -356,10 +357,11 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -380,7 +382,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kfree(pc); + kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -391,7 +393,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void 
mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kfree(rd); + kmem_cache_free(rmap_desc_cache, rd); } /* @@ -898,7 +900,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->link); __free_page(virt_to_page(sp->spt)); __free_page(virt_to_page(sp->gfns)); - kfree(sp); + kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } -- cgit v1.2.3 From 6d74229f013ed8e4a00d74cfa7a3fa6a2315c467 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:07:00 +0800 Subject: KVM: MMU: remove rmap before clear spte Remove rmap before clear spte otherwise it will trigger BUG_ON() in some functions such as rmap_write_protect(). Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb48b0ca5f8..5c9d6df0113 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1813,6 +1813,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; + rmap_remove(vcpu->kvm, sptep); spte = shadow_trap_nonpresent_pte; goto set_pte; } -- cgit v1.2.3 From f55c3f419ab1f0a9d66f44ceeefe752975ae4233 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:08:08 +0800 Subject: KVM: MMU: unalias gfn before sp->gfns[] comparison in sync_page sp->gfns[] contain unaliased gfns, but gpte might contain pointer to aliased region. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15e379eaf3b..22f13797f52 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -586,7 +586,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; - gfn_t gfn = sp->gfns[i]; + gfn_t gfn; if (!is_shadow_present_pte(sp->spt[i])) continue; @@ -597,8 +597,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || - !(gpte & PT_ACCESSED_MASK)) { + gfn = gpte_to_gfn(gpte); + if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); -- cgit v1.2.3 From 1683b2416e4c514d30ff5844a06733d0444ee000 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:09:57 +0800 Subject: KVM: x86: cleanup unused local variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kvm/x86.c: In function ‘handle_emulation_failure’: arch/x86/kvm/x86.c:3844: warning: unused variable ‘ctxt’ Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78147f0421a..b05321adfd2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3857,8 +3857,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static int handle_emulation_failure(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); vcpu->run->exit_reason = 
KVM_EXIT_INTERNAL_ERROR; -- cgit v1.2.3 From 2122ff5eab8faec853e43f6de886e8dc8f31e317 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:25:04 +0300 Subject: KVM: move vcpu locking to dispatcher for generic vcpu ioctls All vcpu ioctls need to be locked, so instead of locking each one specifically we lock at the generic dispatcher. This patch only updates generic ioctls and leaves arch specific ioctls alone. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b05321adfd2..5acd21245fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4773,8 +4773,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4815,14 +4813,11 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4845,15 +4840,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4878,8 +4869,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - vcpu_put(vcpu); - return 0; } @@ -4898,8 +4887,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4931,26 +4918,20 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); return 0; } @@ -4996,8 +4977,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits; struct desc_ptr dt; - vcpu_load(vcpu); - dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -5057,8 +5036,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); - return 0; } @@ -5068,12 +5045,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; - vcpu_load(vcpu); - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - goto unlock_out; + goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) 
kvm_queue_exception(vcpu, DB_VECTOR); else @@ -5115,8 +5090,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -unlock_out: - vcpu_put(vcpu); +out: return r; } @@ -5152,7 +5126,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5160,7 +5133,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - vcpu_put(vcpu); return 0; } @@ -5169,8 +5141,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -5180,8 +5150,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } @@ -5189,8 +5157,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; fxsave->swd = fpu->fsw; @@ -5200,8 +5166,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } -- cgit v1.2.3 From 526b78ad1a9e66ef240ad7c757988de039e42229 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:53:06 +0300 Subject: KVM: x86: Lock arch specific vcpu ioctls centrally Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5acd21245fc..999b017011f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1541,16 +1541,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1798,7 +1794,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1816,7 +1811,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1837,11 +1831,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); return 0; out: @@ -1854,7 +1846,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1866,7 
+1857,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -2098,9 +2088,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2108,11 +2096,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2124,20 +2110,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - vcpu_put(vcpu); - return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2157,7 +2138,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2172,7 +2152,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2230,8 +2209,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2256,8 +2233,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2268,8 +2243,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2292,22 +2265,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - vcpu_put(vcpu); - return 0; } static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - vcpu_load(vcpu); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2316,14 +2283,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - vcpu_load(vcpu); - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; - vcpu_put(vcpu); - return 0; } @@ -2335,6 +2298,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; + vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2481,9 +2445,7 @@ long kvm_arch_vcpu_ioctl(struct file 
*filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; - vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2534,6 +2496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + vcpu_put(vcpu); kfree(lapic); return r; } -- cgit v1.2.3 From 93736624635235cc5372ffca6d62816d02170724 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 12:35:17 +0300 Subject: KVM: Consolidate arch specific vcpu ioctl locking Now that all arch specific ioctls have centralized locking, it is easy to move it to the central dispatcher. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 999b017011f..4c2096f30d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2298,7 +2298,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; - vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2496,7 +2495,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - vcpu_put(vcpu); kfree(lapic); return r; } -- cgit v1.2.3 From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:22:23 +0800 Subject: x86: Export FPU API for KVM use Also add some constants. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/i387.h | 2 ++ arch/x86/include/asm/xsave.h | 3 +++ arch/x86/kernel/i387.c | 3 ++- arch/x86/kernel/process.c | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b90..815c5b2b9f5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) memcpy(dst->state, src->state, xstate_size); } +extern void fpu_finit(struct fpu *fpu); + #endif /* __ASSEMBLY__ */ #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae22..29ee4e4c64c 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,9 @@ #define FXSAVE_SIZE 512 +#define XSTATE_YMM_SIZE 256 +#define XSTATE_YMM_OFFSET (512 + 64) + /* * These are the features that the OS can handle currently. 
*/ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b3225..c4444bce846 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -107,7 +107,7 @@ void __cpuinit fpu_init(void) } #endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..ebcfcceccc7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.3 From 7cf30855e02be7a207ffebb8b9350986f2ba83e9 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:27 +0800 Subject: KVM: x86: Use unlazy_fpu() for host FPU We can avoid unnecessary fpu load when userspace process didn't use FPU frequently. Derived from Avi's idea. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 18 ++---------------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0c06148fa3b..d93601c5290 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,6 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; gva_t mmio_fault_cr2; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c2096f30d9..54ce77582ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -52,6 +52,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5134,21 +5135,10 @@ void fx_init(struct kvm_vcpu *vcpu) { unsigned after_mxcsr_mask; - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. - */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_finit(); kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); vcpu->arch.cr0 |= X86_CR0_ET; @@ -5165,7 +5155,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); + unlazy_fpu(current); kvm_fx_restore(&vcpu->arch.guest_fx_image); trace_kvm_fpu(1); } @@ -5177,7 +5167,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5203,9 +5192,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - /* We do fxsave: this must be aligned. 
*/ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); -- cgit v1.2.3 From 98918833a3e21ffc5619535955e7a003cb788163 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:28 +0800 Subject: KVM: x86: Use FPU API Convert KVM to use generic FPU API. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 17 +------------- arch/x86/kvm/x86.c | 52 +++++++++++++---------------------------- 2 files changed, 17 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d93601c5290..d08bb4a202d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,7 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct guest_fx_image; + struct fpu guest_fpu; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -708,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void kvm_fx_finit(void) -{ - asm("finit"); -} - static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54ce77582ed..84b1788489d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -53,6 +53,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5057,27 +5058,6 @@ out: return r; } -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - /* * Translate a guest virtual address to a guest physical address. 
*/ @@ -5101,7 +5081,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5117,7 +5098,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5133,22 +5115,18 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) void fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - preempt_enable(); + fpu_alloc(&vcpu->arch.guest_fpu); + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) @@ -5156,7 +5134,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5166,7 +5144,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5179,6 +5157,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5213,6 +5192,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -- cgit v1.2.3 From 1d9dc7e000915b9607b480e34fcb4238b789fbb1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:51:24 +0800 Subject: KVM: MMU: split kvm_sync_page() function Split kvm_sync_page() into kvm_sync_page() and kvm_sync_page_transient() to clarify the code address Avi's suggestion kvm_sync_page_transient() function only update shadow page but not mark it sync and not write protect sp->gfn. 
It will be used by a later patch. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c9d6df0113..ef5d140a270 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1199,16 +1199,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - kvm_unlink_unsync_page(vcpu->kvm, sp); + if (clear_unsync) { + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + } + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; @@ -1218,6 +1222,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } +static void mmu_convert_notrap(struct kvm_mmu_page *sp); +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int ret; + + ret = __kvm_sync_page(vcpu, sp, false); + if (!ret) + mmu_convert_notrap(sp); + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + return __kvm_sync_page(vcpu, sp, true); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; -- cgit v1.2.3 From e02aa901b1aa41fb541521800cc2a4774c162485 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:52:34 +0800 Subject: KVM: MMU: don't write-protect if have new mapping to unsync page Two cases may happen in the kvm_mmu_get_page() function: - in one case, the goal sp is already in the cache; if the sp is unsync, we only need to update it to ensure the mapping is still valid, but we do not mark it sync and do not write-protect sp->gfn, since this does not break the unsync rule (one shadow page for a gfn) - in the other case, the goal sp does not exist yet; we need to create a new sp for the gfn, i.e. the gfn may already have another shadow page, so to keep the unsync rule we should sync (mark sync and write-protect) the gfn's unsync shadow page.
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef5d140a270..064ddfbde10 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1337,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *unsync_sp = NULL; struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; @@ -1356,20 +1356,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { if (sp->unsync) - if (kvm_sync_page(vcpu, sp)) - continue; + unsync_sp = sp; if (sp->role.word != role.word) continue; + if (!direct && unsync_sp && + kvm_sync_page_transient(vcpu, unsync_sp)) { + unsync_sp = NULL; + break; + } + mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); - } + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + trace_kvm_mmu_get_page(sp, false); return sp; } + if (!direct && unsync_sp) + kvm_sync_page(vcpu, unsync_sp); + ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) -- cgit v1.2.3 From f78978aa3a8222f7822f15fba5dbaea990ef0887 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:53:35 +0800 Subject: KVM: MMU: only update unsync page in invlpg path Only unsync pages need to be updated at invlpg time, since other shadow pages are write-protected. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 22f13797f52..0671d7a29c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; gpa_t pte_gpa = -1; int level; u64 *sptep; @@ -472,10 +473,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; + sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); int offset, shift; + if (!sp->unsync) + break; + shift = PAGE_SHIFT - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; offset = sp->role.quadrant << shift; @@ -493,7 +497,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) break; } - if (!is_shadow_present_pte(*sptep)) + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } -- cgit v1.2.3 From 9fb2d2b4ff292a01ae30da003d1dc097917b0988 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 23 May 2010 14:28:26 +0300 Subject: KVM: SVM: correctly trace irq injection On SVM, interrupts are injected by svm_set_irq(), not svm_inject_irq(). The latter is used only to wait for the interrupt window.
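Condensed from the svm.c hunk that follows: the tracepoint and the statistics counter move from the window-waiting helper into the real injection path. A simplified sketch, not the literal patch:

	static void svm_set_irq(struct kvm_vcpu *vcpu)		/* the actual injection path */
	{
		struct vcpu_svm *svm = to_svm(vcpu);

		trace_kvm_inj_virq(vcpu->arch.interrupt.nr);	/* trace the injected vector here... */
		++vcpu->stat.irq_injections;			/* ...and count it here */

		svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
			SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
	}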
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 134260c36ce..f5c2b432078 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2917,9 +2917,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - trace_kvm_inj_virq(irq); - - ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -2933,6 +2930,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -- cgit v1.2.3 From 221d059d15f1c8bd070a63fd45cd8d2598af5f99 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 23 May 2010 18:37:00 +0300 Subject: KVM: Update Red Hat copyrights Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/i8254.c | 1 + arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/irq.c | 1 + arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/paging_tmpl.h | 1 + arch/x86/kvm/svm.c | 1 + arch/x86/kvm/timer.c | 14 ++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 11 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7a36eec8bab..a4c2dcd1032 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,6 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Avi Kivity * Yaniv Kamay diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25..188d82762c1 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,6 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338..2c73f449314 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a..0f4e488331c 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,6 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9..d8258a0060f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,6 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affilates. 
* * Authors: * Dor Laor diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 064ddfbde10..25d3bb2543e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0671d7a29c3..167f53357ee 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5c2b432078..02ea5cf4b1e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,6 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ff..564548fbb3d 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -1,3 +1,17 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + #include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8aac4e9890..9c3ffc5fde4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,6 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84b1788489d..033b9c207f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity -- cgit v1.2.3 From 9cf5cf5ad43b293581e5b87678ea5783c06d1a41 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:40:07 +0800 Subject: KVM: MMU: allow more pages to become unsync at gfn mapping time In the current code, a shadow page can become unsync only if there is a single shadow page for a gfn; this rule is too strict. In fact, we can let every last-level mapping page (i.e., the pte page) become unsync, and sync them at invlpg or tlb-flush time. This patch allows more pages to become unsync at gfn mapping time.
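Condensed, the relaxed decision in mmu_need_write_protect() (taken from the mmu.c diff that follows; not new code) looks like this:

	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
		if (s->gfn != gfn || s->role.direct || s->role.invalid)
			continue;
		if (s->role.level != PT_PAGE_TABLE_LEVEL)
			return 1;			/* non-leaf shadow page: keep write protection */
		if (!need_unsync && !s->unsync) {
			if (!can_unsync || !oos_shadow)
				return 1;
			need_unsync = true;		/* leaf page: may be switched to unsync */
		}
	}
	if (need_unsync)
		kvm_unsync_pages(vcpu, gfn);		/* mark all of the gfn's leaf pages unsync */
	return 0;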
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25d3bb2543e..ba119dae890 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1170,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: found role %x\n", - __func__, sp->role.word); - return sp; - } - return NULL; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); @@ -1759,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + trace_kvm_mmu_unsync_page(sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + kvm_mmu_mark_parents_unsync(sp); + mmu_convert_notrap(sp); +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - unsigned index; struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; + unsigned index; - index = kvm_page_table_hashfn(sp->gfn); + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.direct) + if (s->gfn != gfn || s->role.direct || s->unsync || + s->role.invalid) continue; - if (s->role.word != sp->role.word) - return 1; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); } - trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; - sp->unsync = 1; - - kvm_mmu_mark_parents_unsync(sp); - - mmu_convert_notrap(sp); - return 0; } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *shadow; + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + bool need_unsync = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || s->role.direct || s->role.invalid) + continue; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (shadow->unsync) - return 0; - if (can_unsync && oos_shadow) - return kvm_unsync_page(vcpu, shadow); - return 1; + + if (!need_unsync && !s->unsync) { + if (!can_unsync || !oos_shadow) + return 1; + need_unsync = true; + } } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); return 0; } -- cgit v1.2.3 From 9f1a122f970dbef5ba3496587f39df5c1853083f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:41:33 +0800
Subject: KVM: MMU: allow more pages to become unsync at getting sp time Allow more pages to become unsync when getting a new sp. If we need to create a new shadow page for a gfn but it is not allowed to be unsync (level > 1), we should sync all of the gfn's unsync pages. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba119dae890..07673487fd5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1220,6 +1220,35 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return __kvm_sync_page(vcpu, sp, true); } +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + unsigned index; + bool flush = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || !s->unsync || s->role.invalid) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s))) { + kvm_mmu_zap_page(vcpu->kvm, s); + continue; + } + kvm_unlink_unsync_page(vcpu->kvm, s); + flush = true; + } + + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1318,8 +1347,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp, *unsync_sp = NULL; + struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; + bool need_sync = false; role = vcpu->arch.mmu.base_role; role.level = level; @@ -1336,17 +1366,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { - if (sp->unsync) - unsync_sp = sp; + if (!need_sync && sp->unsync) + need_sync = true; if (sp->role.word != role.word) continue; - if (!direct && unsync_sp && - kvm_sync_page_transient(vcpu, unsync_sp)) { - unsync_sp = NULL; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - } mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); @@ -1358,9 +1385,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, false); return sp; } - if (!direct && unsync_sp) - kvm_sync_page(vcpu, unsync_sp); - ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) @@ -1371,6 +1395,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) -- cgit v1.2.3 From c8174f7b35b3018c4c7b3237ed1c792e454fd5c3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 24 May 2010 01:01:04 +0300 Subject: KVM: VMX: Add constant for invalid guest state exit reason For the sake of completeness, this patch adds a symbolic constant for VMX exit reason 0x21 (invalid guest state).
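For illustration only (this wiring is not part of the patch), the constant lets an exit-reason check read symbolically instead of as a magic number; handle_invalid_guest_state() here is a hypothetical handler name:

	/* hypothetical usage sketch -- handler name assumed, not from the patch */
	if (exit_reason == EXIT_REASON_INVALID_STATE)	/* 0x21, previously a bare literal */
		return handle_invalid_guest_state(vcpu);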
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9e6779f7cf2..104cf86a756 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -257,6 +257,7 @@ enum vmcs_field { #define EXIT_REASON_IO_INSTRUCTION 30 #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 -- cgit v1.2.3 From 2032a93d66fa282ba0f2ea9152eeff9511fa9a96 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:49:59 +0800 Subject: KVM: MMU: Don't allocate gfns page for direct mmu pages When sp->role.direct is set, sp->gfns does not contain any essential information, leaf sptes reachable from this sp are for a continuous guest physical memory range (a linear range). So sp->gfns[i] (if it was set) equals to sp->gfn + i. (PT_PAGE_TABLE_LEVEL) Obviously, it is not essential information, we can calculate it when need. It means we don't need sp->gfns when sp->role.direct=1, Thus we can save one page usage for every kvm_mmu_page. Note: Access to sp->gfns must be wrapped by kvm_mmu_page_get_gfn() or kvm_mmu_page_set_gfn(). It is only exposed in FNAME(sync_page). Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 +++++++++++++++++++++++++++++--------- arch/x86/kvm/paging_tmpl.h | 3 +++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07673487fd5..f46b6c9aff2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -397,6 +397,22 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) kmem_cache_free(rmap_desc_cache, rd); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; +} + /* * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. 
@@ -547,7 +563,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -605,6 +621,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; pfn_t pfn; + gfn_t gfn; unsigned long *rmapp; int i; @@ -616,7 +633,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -900,7 +918,8 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } @@ -911,13 +930,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) + u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -1386,7 +1407,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; @@ -3403,7 +3424,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (*sptep & PT_WRITABLE_MASK) { rev_sp = page_header(__pa(sptep)); - gfn = rev_sp->gfns[sptep - rev_sp->spt]; + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) @@ -3417,8 +3438,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 167f53357ee..2ee7060a80a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -582,6 +582,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) offset = nr_present = 0; + /* direct kvm_mmu_page can not be unsync. 
*/ + BUG_ON(sp->role.direct); + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; -- cgit v1.2.3 From c9fa0b3bef9a0b117b3c3f958ec553c21f609a9f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:25 +0800 Subject: KVM: MMU: Calculate correct base gfn for direct non-DIR level In Document/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in __direct_map(), the base gfn calculation is incorrect, it does not calculate correctly when level=3 or 4. Fix by using PT64_LVL_ADDR_MASK() which accounts for all levels correctly. Reported-by: Marcelo Tosatti Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f46b6c9aff2..c0350be52c9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2020,7 +2020,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); -- cgit v1.2.3 From 3af1817a0d65e8c1317e8d23cfe8a91aa1d4a065 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:19 +0800 Subject: KVM: MMU: calculate correct gfn for small host pages backing large guest pages In Documentation/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in function FNAME(fetch)(), sp->gfn is incorrect when one of following situations occurred: 1) guest is 32bit paging and the guest PDE maps a 4-MByte page (backed by 4k host pages), FNAME(fetch)() miss handling the quadrant. And if guest use pse-36, "table_gfn = gpte_to_gfn(gw->ptes[level - delta]);" is incorrect. 2) guest is long mode paging and the guest PDPTE maps a 1-GByte page (backed by 4k or 2M host pages). So we fix it to suit to the document and suit to the code which requires sp->gfn correct when sp->role.direct=1. We use the goal mapping gfn(gw->gfn) to calculate the base page frame for linear translations, it is simple and easy to be understood. Reported-by: Marcelo Tosatti Reported-by: Gui Jianfeng Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2ee7060a80a..1f7f5dd8306 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,10 +339,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - delta]); - /* advance table_gfn when emulating 1gb pages with 4k */ - if (delta == 0) - table_gfn += PT_INDEX(addr, level); + /* + * It is a large guest pages backed by small host pages, + * So we set @direct(@shadow_page->role.direct)=1, and + * set @table_gfn(@shadow_page->gfn)=the base page frame + * for linear translations. 
+ */ + table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; } else { direct = 0; -- cgit v1.2.3 From 01c168ac3d6568fed0373d82bd2db2b9339aab16 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 27 May 2010 16:09:48 +0800 Subject: KVM: MMU: don't check PT_WRITABLE_MASK directly Since we have is_writable_pte(), make use of it. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0350be52c9..9f4be0114bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2990,7 +2990,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (is_writable_pte(pt[i])) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); @@ -3425,7 +3425,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (*sptep & PT_WRITABLE_MASK) { + if (is_writable_pte(*sptep)) { rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); @@ -3474,7 +3474,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) if (!(ent & PT_PRESENT_MASK)) continue; - if (!(ent & PT_WRITABLE_MASK)) + if (!is_writable_pte(ent)) continue; inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } @@ -3508,7 +3508,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { - if (*spte & PT_WRITABLE_MASK) + if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, -- cgit v1.2.3 From 6dc696d4ddf2181eefee361e1d24a49351aef1f6 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 26 May 2010 15:09:43 -1000 Subject: KVM: SVM: Fix EFER.LME being stripped Must set VCPU register to be the guest notion of EFER even if that setting is not valid on hardware. This was masked by the set in set_efer until 7657fd5ace88e8092f5f3a84117e093d7b893f26 broke that. Fix is simply to set the VCPU register before stripping bits. Signed-off-by: Zachary Amsden Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 02ea5cf4b1e..9c68a650f57 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -286,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + vcpu->arch.efer = efer; if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) -- cgit v1.2.3 From 10ab25cd6bf7ee4e5a55d81f203f7dc1a855c27e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 25 May 2010 16:01:50 +0200 Subject: KVM: x86: Propagate fpu_alloc errors Memory allocation may fail. Propagate such errors. 
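The propagation pattern, condensed from the x86.c hunk that follows (assuming the fpu_alloc()/fpu_finit() API shown in the diff):

	int fx_init(struct kvm_vcpu *vcpu)
	{
		int err;

		err = fpu_alloc(&vcpu->arch.guest_fpu);	/* may fail, e.g. -ENOMEM */
		if (err)
			return err;			/* propagate instead of ignoring */

		fpu_finit(&vcpu->arch.guest_fpu);
		vcpu->arch.cr0 |= X86_CR0_ET;
		return 0;
	}

Callers (svm_create_vcpu(), vmx_vcpu_reset()) then check the return value and unwind, as the diff shows.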
Signed-off-by: Jan Kiszka Reviewed-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 7 ++++++- arch/x86/kvm/vmx.c | 4 +++- arch/x86/kvm/x86.c | 11 +++++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d08bb4a202d..0cd0f2923af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -624,7 +624,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -void fx_init(struct kvm_vcpu *vcpu); +int fx_init(struct kvm_vcpu *vcpu); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c68a650f57..2ae0c392329 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -904,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - fx_init(&svm->vcpu); + err = fx_init(&svm->vcpu); + if (err) + goto free_page4; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9c3ffc5fde4..e71c731433e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,7 +2659,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - fx_init(&vmx->vcpu); + ret = fx_init(&vmx->vcpu); + if (ret != 0) + goto out; seg_setup(VCPU_SREG_CS); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 033b9c207f9..e6e0d7781af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5114,12 +5114,19 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu) { - fpu_alloc(&vcpu->arch.guest_fpu); + int err; + + err = fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; + + return 0; } EXPORT_SYMBOL_GPL(fx_init); -- cgit v1.2.3 From 8184dd38e22fcaec664c2b98c382b85c26780e26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 27 May 2010 14:22:51 +0300 Subject: KVM: MMU: Allow spte.w=1 for gpte.w=0 and cr0.wp=0 only in shadow mode When tdp is enabled, the guest's cr0.wp shouldn't have any effect on spte permissions. 
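Condensed from the one-line change below: the cr0.wp override now applies only under shadow paging; with tdp, guest cr0.wp must not influence spte writability. A sketch of the guard, not the full set_spte():

	/* grant spte write access either because the gpte allows it, or --
	 * under shadow paging only -- because cr0.wp=0 permits a supervisor
	 * write despite a read-only gpte
	 */
	if ((pte_access & ACC_WRITE_MASK)
	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
		&& !user_fault)) {
		/* ... mark the spte writable ... */
	}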
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9f4be0114bc..69d40a6e1e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1882,7 +1882,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) + && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { -- cgit v1.2.3 From b66d80006e415ee083e59c9429911eab78047f8f Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 31 May 2010 17:11:39 +0800 Subject: KVM: MMU: Don't calculate quadrant if tdp_enabled There's no need to calculate quadrant if tdp is enabled. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 69d40a6e1e6..d3cd102aee2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit v1.2.3 From 5120702e732ed72c7055f511f8dd01de36424569 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 31 May 2010 22:40:54 +0300 Subject: KVM: VMX: Properly return error to userspace on vmentry failure The vmexit handler returns KVM_EXIT_UNKNOWN since there is no handler for vmentry failures. This intercepts vmentry failures and returns KVM_FAIL_ENTRY to userspace instead. Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e71c731433e..8c9085d44c3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3665,6 +3665,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + if (unlikely(vmx->fail)) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason -- cgit v1.2.3 From 4bc9b9828150747386130ab172f7e868e1a0fc2a Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 2 Jun 2010 14:05:24 +0800 Subject: KVM: VMX: Enforce EPT pagetable level checking We only support 4 levels EPT pagetable now. 
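In outline (full hunk below), hardware_setup() now degrades gracefully when the 4-level page-walk capability is absent:

	/* sketch of the capability gating shown in the diff */
	if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) {
		enable_ept = 0;			/* fall back to shadow paging */
		enable_unrestricted_guest = 0;	/* unrestricted guest requires EPT */
	}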
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8c9085d44c3..2201e381620 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -340,6 +340,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -1568,7 +1573,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) { + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; } -- cgit v1.2.3 From 7bee342a9e994cce7122cb187b4f3ded9d871165 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 2 Jun 2010 17:06:03 +0800 Subject: KVM: x86: use linux/uaccess.h instead of asm/uaccess.h Should use linux/uaccess.h instead of asm/uaccess.h Signed-off-by: Lai Jiangshan Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6e0d7781af..b08c0052e33 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,13 +42,13 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS #include "trace.h" #include -#include #include #include #include -- cgit v1.2.3 From 518c8aee5ca74fc03273fc6b4893cf456d65d545 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 4 Jun 2010 08:51:39 +0800 Subject: KVM: VMX: Make sure single type invvpid is supported before issuing invvpid instruction According to SDM, we need check whether single-context INVVPID type is supported before issuing invvpid instruction. 
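The guarded invalidation, condensed from the vmx.c hunk that follows; single-context INVVPID is issued only when the capability bit advertises it:

	static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
	{
		if (vmx->vpid == 0)
			return;

		/* only issue single-context INVVPID if the hardware supports it */
		if (cpu_has_vmx_invvpid_single())
			__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
	}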
Signed-off-by: Gui Jianfeng Reviewed-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 104cf86a756..b4e28400c9f 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -376,6 +376,8 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2201e381620..94526536188 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -360,6 +360,11 @@ static inline bool cpu_has_vmx_invept_global(void) return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -504,7 +509,8 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) if (vmx->vpid == 0) return; - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } static inline void ept_sync_global(void) -- cgit v1.2.3 From 03116aa57e75b1bbe8b5e04f3cd21cdb6588c4ba Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:52:17 +0800 Subject: KVM: MMU: skip invalid sp when unprotect page In kvm_mmu_unprotect_page(), invalid sps can be skipped. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cd102aee2..3ac51153bc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1629,7 +1629,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) bucket = &kvm->arch.mmu_page_hash[index]; restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct) { + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; -- cgit v1.2.3 From 7ae680eb2d5f0cb10ca0e6d1ff5ecb145befe8e4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:07 +0800 Subject: KVM: MMU: introduce some macros to clean up hlist traversing Introduce for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() to clean up hlist traversal. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 122 +++++++++++++++++++++--------------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ac51153bc4..881ad918455 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1201,6 +1201,17 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ +
&(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) { @@ -1244,16 +1255,12 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) /* @gfn should be write-protected at the call site */ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; bool flush = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || !s->unsync || s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1365,9 +1372,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; - unsigned index; unsigned quadrant; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; bool need_sync = false; @@ -1383,36 +1388,34 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) - if (sp->gfn == gfn) { - if (!need_sync && sp->unsync) - need_sync = true; + for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + if (!need_sync && sp->unsync) + need_sync = true; - if (sp->role.word != role.word) - continue; + if (sp->role.word != role.word) + continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); - trace_kvm_mmu_get_page(sp, false); - return sp; - } + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, bucket); + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -1617,46 +1620,34 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *n; int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + 
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; + } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *nn; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: zap %lx %x\n", - __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; } } @@ -1799,17 +1790,11 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; - - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->unsync || - s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); __kvm_unsync_page(vcpu, s); @@ -1819,18 +1804,11 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; bool need_unsync = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->role.invalid) - continue; - + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2703,8 +2681,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2772,13 +2748,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) - continue; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { pte_size = sp->role.cr4_pae ? 
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; -- cgit v1.2.3 From 7775834a233478ec855b97e30727248f12eafe76 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:54 +0800 Subject: KVM: MMU: split the operations of kvm_mmu_zap_page() Use kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to split the kvm_mmu_zap_page() function; then we can: - traverse the hlist safely - easily gather the remote tlb flushes that occur while pages are zapped These features are used in the later patches. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 52 ++++++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/mmutrace.h | 2 +- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 881ad918455..9b849a70742 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -916,6 +916,7 @@ static int is_empty_shadow_page(u64 *spt) static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); if (!sp->role.direct) @@ -1200,6 +1201,10 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); #define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ hlist_for_each_entry_safe(sp, pos, n, \ @@ -1530,7 +1535,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent) + struct kvm_mmu_page *parent, + struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; @@ -1544,7 +1550,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_zap_page(kvm, sp); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } @@ -1554,16 +1560,16 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { int ret; - trace_kvm_mmu_zap_page(sp); + trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp); + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); - kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) @@ -1571,17 +1577,45 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->root_count) { /* Count self */ ret++; - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); + list_move(&sp->link, invalid_list); } else { - sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } + + sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + +
do { + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + return ret; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc..3aab0f0930e 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp) -- cgit v1.2.3 From 103ad25a86a6ec5418b3dca6a0d2bf2ba01a8318 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:54:38 +0800 Subject: KVM: MMU: don't get free page number in the loop In a later patch, we will modify the way sps are zapped, like below: kvm_mmu_prepare_zap_page A kvm_mmu_prepare_zap_page B kvm_mmu_prepare_zap_page C .... kvm_mmu_commit_zap_page [ zapping multiple sps only needs one call to kvm_mmu_commit_zap_page ] In the __kvm_mmu_free_some_pages() function, the free page count is read from 'vcpu->kvm->arch.n_free_mmu_pages' inside the loop; this hinders applying kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page(), since kvm_mmu_prepare_zap_page() does not free the sp. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b849a70742..1aad8e713f7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2863,13 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + int free_pages; + + free_pages = vcpu->kvm->arch.n_free_mmu_pages; + while (free_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } } -- cgit v1.2.3 From d98ba053656c033180781007241f2c9d54606d56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:55:29 +0800 Subject: KVM: MMU: gather remote tlb flush which occurs during page zapped Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() instead of kvm_mmu_zap_page() reduces remote tlb flush IPIs. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1aad8e713f7..44548e34697 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,6 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -1218,10 +1217,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, (sp)->role.invalid) {} else static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - bool clear_unsync) + struct list_head *invalid_list, bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1232,7 +1231,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1244,17 +1243,22 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + LIST_HEAD(invalid_list); int ret; - ret = __kvm_sync_page(vcpu, sp, false); + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); if (!ret) mmu_convert_notrap(sp); + else + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + return ret; } -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, true); + return __kvm_sync_page(vcpu, sp, invalid_list, true); } /* @gfn should be write-protected at the call site */ @@ -1262,6 +1266,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); bool flush = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { @@ -1271,13 +1276,14 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_zap_page(vcpu->kvm, s); + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } kvm_unlink_unsync_page(vcpu->kvm, s); flush = true; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) kvm_mmu_flush_tlb(vcpu); } @@ -1348,6 +1354,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1360,9 +1367,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp); + kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1606,16 +1614,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); - return ret; -} - /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1623,6 +1621,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; + LIST_HEAD(invalid_list); used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 
used_pages = max(0, used_pages); @@ -1640,8 +1639,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } @@ -1656,6 +1657,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); @@ -1665,9 +1667,10 @@ restart: pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; } @@ -1675,14 +1678,16 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *nn; + LIST_HEAD(invalid_list); restart: for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -2123,6 +2128,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -2132,8 +2138,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2146,10 +2154,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2715,6 +2725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2801,7 +2812,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_zap_page(vcpu->kvm, sp)) + if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list)) goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2836,6 +2848,7 @@ restart: ++spte; } } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2864,6 +2877,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { int free_pages; + LIST_HEAD(invalid_list); free_pages = vcpu->kvm->arch.n_free_mmu_pages; while (free_pages < KVM_REFILL_PAGES && @@ -2872,9 +2886,11 @@ void 
__kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3009,25 +3025,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page); + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -3040,6 +3059,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages, idx, freed_pages; + LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -3047,12 +3067,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.3 From f41d335a02d5132c14ec0459d3b2790eeb16fb11 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:11 +0800 Subject: KVM: MMU: traverse sp hlist safely Now we can safely traverse the sp hlist. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44548e34697..3b75689eda9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1205,13 +1205,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1265,11
+1265,11 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (!s->unsync) continue; @@ -1387,7 +1387,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node, *tmp; + struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1401,7 +1401,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { if (!need_sync && sp->unsync) need_sync = true; @@ -1656,19 +1656,18 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; @@ -1677,15 +1676,13 @@ restart: static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *nn; + struct hlist_node *node; LIST_HEAD(invalid_list); -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -1830,9 +1827,9 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1844,10 +1841,10 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2724,7 +2721,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; @@ -2794,8 +2791,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } -restart: - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { + 
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; @@ -2812,9 +2808,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } -- cgit v1.2.3 From 0671a8e75d8aeb33e15c5152147abb0d2fa0c1e6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:59 +0800 Subject: KVM: MMU: reduce remote tlb flush in kvm_mmu_pte_write() Collect the remote tlb flushes in the kvm_mmu_pte_write() path. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b75689eda9..b285449e82b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2666,11 +2666,15 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) { - if (need_remote_flush(old, new)) + if (zap_page) + return; + + if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); - else + else if (local_flush) kvm_mmu_flush_tlb(vcpu); } @@ -2735,6 +2739,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; int r; int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2808,7 +2815,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2833,16 +2840,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != sp->role.quadrant) continue; } + local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); if (gentry) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; ++spte; } } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3 From b9d762fa79f541ab480cdb733b46fdb0b4471c2d Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:32:29 +0800 Subject: KVM: VMX: Add all-context INVVPID type support Add all-context INVVPID type support.
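Condensed, the dispatch this patch and the following one establish looks like the sketch below (simplified from the diffs that follow, not the literal kernel code):

/*
 * Prefer the cheaper single-context invalidation, which flushes only
 * this vcpu's VPID; fall back to all-context INVVPID, which flushes
 * every VPID, when the CPU only advertises the global type.
 */
static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_all(vmx);	/* single-context INVVPID */
	else
		vpid_sync_vcpu_global();	/* all-context INVVPID */
}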
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b4e28400c9f..96a5886d384 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -377,6 +377,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ +#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94526536188..622d83b0caf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -365,6 +365,11 @@ static inline bool cpu_has_vmx_invvpid_single(void) return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -513,6 +518,20 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_all(vmx); + else + vpid_sync_vcpu_global(); +} + static inline void ept_sync_global(void) { if (cpu_has_vmx_invept_global()) @@ -1800,7 +1819,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vpid_sync_vcpu_all(to_vmx(vcpu)); + vpid_sync_context(to_vmx(vcpu)); if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -2756,7 +2775,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - vpid_sync_vcpu_all(vmx); + vpid_sync_context(vmx); ret = 0; -- cgit v1.2.3 From 1760dd4939a62591e492971858fac8cce1e4539e Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:33:27 +0800 Subject: KVM: VMX: rename vpid_sync_vcpu_all() to vpid_sync_vcpu_single() The name "vpid_sync_vcpu_all" isn't appropriate, since it affects just a single vpid; rename it to vpid_sync_vcpu_single().
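For reference, the renamed helper (visible in context in the diff above) only invalidates the one VPID owned by this vcpu, which is what made the old name misleading; a sketch:

static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
	/* VPID 0 belongs to the host and is never flushed this way. */
	if (vmx->vpid == 0)
		return;
	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}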
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 622d83b0caf..7d7361750bf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -509,7 +509,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); } -static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) { if (vmx->vpid == 0) return; @@ -527,7 +527,7 @@ static inline void vpid_sync_vcpu_global(void) static inline void vpid_sync_context(struct vcpu_vmx *vmx) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_all(vmx); + vpid_sync_vcpu_single(vmx); else vpid_sync_vcpu_global(); } -- cgit v1.2.3 From 4b9d3a04519fb508ad3b7ce8a7962929b2614185 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 10:15:51 +0800 Subject: KVM: VMX: fix rcu usage warning in init_rmode() fix: [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/kvm_host.h:258 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by qemu-system-x86/3796: #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1a/0x66 [kvm] stack backtrace: Pid: 3796, comm: qemu-system-x86 Not tainted 2.6.34 #25 Call Trace: [] lockdep_rcu_dereference+0x9d/0xa5 [] gfn_to_memslot_unaliased+0x65/0xa0 [kvm] [] gfn_to_hva+0x22/0x4c [kvm] [] kvm_write_guest_page+0x2a/0x7f [kvm] [] kvm_clear_guest_page+0x1a/0x1c [kvm] [] init_rmode+0x3b/0x180 [kvm_intel] [] vmx_set_cr0+0x350/0x4d3 [kvm_intel] [] kvm_arch_vcpu_ioctl_set_sregs+0x122/0x31a [kvm] [] kvm_vcpu_ioctl+0x578/0xa3d [kvm] [] ? cpu_clock+0x2d/0x40 [] ? fget_light+0x244/0x28e [] ? trace_hardirqs_off_caller+0x1f/0x10e [] vfs_ioctl+0x32/0xa6 [] do_vfs_ioctl+0x47f/0x4b8 [] ? sub_preempt_count+0xa3/0xb7 [] ? fget_light+0x266/0x28e [] ? 
fget_light+0x111/0x28e [] sys_ioctl+0x47/0x6a [] system_call_fastpath+0x16/0x1b Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d7361750bf..01b054c9813 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,21 +2659,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static int init_rmode(struct kvm *kvm) { + int idx, ret = 0; + + idx = srcu_read_lock(&kvm->srcu); if (!init_rmode_tss(kvm)) - return 0; + goto exit; if (!init_rmode_identity_map(kvm)) - return 0; - return 1; + goto exit; + + ret = 1; +exit: + srcu_read_unlock(&kvm->srcu, idx); + return ret; } static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret, idx; + int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2783,7 +2789,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } -- cgit v1.2.3 From 4f78fd08e91c52f097d64a42d903b76fe52a3a0f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:05 +0800 Subject: KVM: MMU: remove unnecessary remote tlb flush This remote tlb flush is not necessary, since we have already synced while the sp was zapped. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b285449e82b..098a0b8616b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3040,8 +3040,6 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); } static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, -- cgit v1.2.3 From 5304efde6ae27deeeae79b97af709d4ceecc336e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:57 +0800 Subject: KVM: MMU: use wrapper function to flush local tlb Use the kvm_mmu_flush_tlb() function instead of calling kvm_x86_ops->tlb_flush(vcpu) directly.
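The payoff comes in the next patch but one, which changes the flush policy in exactly one place. Sketched (the "after" body is what that patch installs):

void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	/* Before: flush synchronously on every call. */
	/* kvm_x86_ops->tlb_flush(vcpu); */
	/* After: just record the request; the flush happens (at most
	 * once) on the next guest entry, so repeated requests and
	 * remote IPIs collapse into a single flush. */
	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
}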
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 098a0b8616b..e087f855461 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1984,7 +1984,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_x86_ops->tlb_flush(vcpu); + kvm_mmu_flush_tlb(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); -- cgit v1.2.3 From 3b5d13218667b3ca52efa52cec1d322163bf5465 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:07:01 +0800 Subject: KVM: MMU: delay local tlb flush Delay the local tlb flush until we enter guest mode; this reduces the vpid flush frequency and the number of remote tlb flush IPIs (if the KVM_REQ_TLB_FLUSH bit is already set, no IPI is sent). Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e087f855461..4706a936e36 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2364,7 +2364,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); } static void paging_new_cr3(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From a24e809902339458416900869abdcc51a44bfd48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 10 Jun 2010 13:10:55 +0200 Subject: KVM: Fix unused but set warnings No real bugs in this one. Signed-off-by: Andi Kleen Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1f7f5dd8306..9308be2d5c0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -444,6 +444,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); + (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); -- cgit v1.2.3 From f495c6e5e8fdc972162241df5bdff5bcebb4dc33 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:21:29 +0300 Subject: KVM: VMX: Fix incorrect rcu deref in rmode_tss_base() Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 01b054c9813..26ba61d6af8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1688,7 +1688,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = kvm->memslots->memslots[0].base_gfn + + base_gfn = slots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } -- cgit v1.2.3 From 2acf923e38fb6a4ce0c57115decbb38d334902ac Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 10 Jun 2010 11:27:12 +0800 Subject: KVM: VMX: Enable XSAVE/XRSTOR for guest This patch enables the guest to use the XSAVE/XRSTOR instructions. We assume that host_xcr0 uses all possible bits that the OS supports. And we load xcr0 the same way we handle the fpu - do it as late as we can.
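A condensed sketch of the lazy switching this implies, pieced together from the x86.c hunks below (simplified, not the full patch):

static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
	/* Switch to the guest's XCR0 only if it enabled OSXSAVE. */
	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
	    !vcpu->guest_xcr0_loaded) {
		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
		vcpu->guest_xcr0_loaded = 1;
	}
}

static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
	/* Restore the host value, skipping the costly xsetbv when
	 * guest and host happen to agree. */
	if (vcpu->guest_xcr0_loaded) {
		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
		vcpu->guest_xcr0_loaded = 0;
	}
}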
Signed-off-by: Dexuan Cui Signed-off-by: Sheng Yang Reviewed-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/kvm_cache_regs.h | 6 ++ arch/x86/kvm/vmx.c | 13 ++++ arch/x86/kvm/x86.c | 130 +++++++++++++++++++++++++++++++++++++--- 5 files changed, 145 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0cd0f2923af..91631b8b209 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -302,6 +302,7 @@ struct kvm_vcpu_arch { } update_pte; struct fpu guest_fpu; + u64 xcr0; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 96a5886d384..9f0cbd987d5 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -267,6 +267,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 /* * Interruption-information format diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index d2a98f8f9af..6491ac8e755 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) return kvm_read_cr4_bits(vcpu, ~0UL); } +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 26ba61d6af8..864a1b6d155 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "trace.h" @@ -3390,6 +3392,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; @@ -3668,6 +3680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08c0052e33..b5e644701cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -65,6 +65,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define 
CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -150,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -474,6 +482,61 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + u64 xcr0; + + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } +} + int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); @@ -482,6 +545,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & CR4_RESERVED_BITS) return 1; + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -498,6 +564,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + return 0; } @@ -666,11 +735,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
@@ -1814,6 +1878,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -1837,6 +1902,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); return 0; out: @@ -1917,7 +1983,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | @@ -1932,7 +1998,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; @@ -1990,6 +2056,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; @@ -4125,6 +4205,9 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return 0; out: @@ -4523,6 +4606,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -4568,6 +4670,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); atomic_set(&vcpu->guest_mode, 1); smp_wmb(); @@ -5124,6 +5227,11 @@ int fx_init(struct kvm_vcpu *vcpu) fpu_finit(&vcpu->arch.guest_fpu); + /* + * Ensure guest xcr0 is valid for loading + */ + vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.cr0 |= X86_CR0_ET; return 0; @@ -5140,6 +5248,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) if (vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. 
+ */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); fpu_restore_checking(&vcpu->arch.guest_fpu); @@ -5148,6 +5262,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; -- cgit v1.2.3 From 49a9b07edcf4aff159c1f3d3a27e58cf38bc27cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:14 +0300 Subject: KVM: Fix mov cr0 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 13 +++++++++++-- arch/x86/kvm/x86.c | 12 +++--------- 4 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 91631b8b209..b2370845021 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -597,7 +597,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2ae0c392329..6d1616d47c5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -807,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm) * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); save->cr4 = X86_CR4_PAE; /* rdx = ?? 
*/ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 864a1b6d155..1baf4b2d98e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3157,11 +3157,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; + int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -3172,8 +3181,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr0(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 3: kvm_set_cr3(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e644701cc..05e9b5dde64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -425,7 +425,7 @@ out: return changed; } -static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | @@ -468,17 +468,11 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_mmu_reset_context(vcpu); return 0; } - -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (__kvm_set_cr0(vcpu, cr0)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -3732,7 +3726,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) switch (cr) { case 0: - res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; -- cgit v1.2.3 From a83b29c6ad6d6497e569edbc29e556a384cebddd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:15 +0300 Subject: KVM: Fix mov cr4 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
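All three mov-to-cr fixes rely on the complete_insn_gp() helper introduced in the cr0 patch above: the setter now returns non-zero on failure, and the exit handler performs exactly one of inject-or-skip. Sketched usage:

/* In handle_cr(): RIP must still point at the faulting mov when a #GP
 * is injected, and must be advanced only when the write succeeded. */
err = kvm_set_cr4(vcpu, val);
complete_insn_gp(vcpu, err);	/* inject #GP or skip, never both */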
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2370845021..ea8c319cdff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -599,7 +599,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1baf4b2d98e..f64d65dc38c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3189,8 +3189,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr4(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05e9b5dde64..ed3af15d440 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -531,7 +531,7 @@ static void update_cpuid(struct kvm_vcpu *vcpu) } } -int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; @@ -563,12 +563,6 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } - -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (__kvm_set_cr4(vcpu, cr4)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr4); static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) @@ -3735,7 +3729,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) res = __kvm_set_cr3(vcpu, val); break; case 4: - res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: res = __kvm_set_cr8(vcpu, val & 0xfUL); -- cgit v1.2.3 From 2390218b6aa2eb3784b0a82fa811c19097dc793a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:16 +0300 Subject: KVM: Fix mov cr3 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
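Returning an error code from the setters also lets the instruction emulator share them directly; condensed from the x86.c hunks of these three patches (a sketch, not the complete function):

static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
{
	int res = 0;

	switch (cr) {
	case 0:
		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
		break;
	case 3:
		res = kvm_set_cr3(vcpu, val);
		break;
	case 4:
		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
		break;
	}
	/* A non-zero result makes the emulator inject #GP itself. */
	return res;
}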
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ea8c319cdff..c2813d658f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,7 +598,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4706a936e36..aa98fca03ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3203,7 +3203,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6d1616d47c5..f7a6fdcf8ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1963,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2086,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; } else - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); /* Guest paging mode is active - reset mmu */ kvm_mmu_reset_context(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f64d65dc38c..345a3547051 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3185,8 +3185,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) complete_insn_gp(vcpu, err); return 1; case 3: - kvm_set_cr3(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr3(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 4: err = kvm_set_cr4(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed3af15d440..795999e1ac1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -565,7 +565,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } EXPORT_SYMBOL_GPL(kvm_set_cr4); -static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); @@ -604,12 +604,6 @@ static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.mmu.new_cr3(vcpu); return 0; } - -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (__kvm_set_cr3(vcpu, cr3)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr3); int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) @@ -3726,7 +3720,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu 
*vcpu) vcpu->arch.cr2 = val; break; case 3: - res = __kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); -- cgit v1.2.3 From 2d5b5a665508c60577c1088e0405850a965b6795 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Sun, 13 Jun 2010 17:29:39 +0800 Subject: KVM: x86: XSAVE/XRSTOR live migration support This patch enables save/restore of the xsave state. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm.h | 22 +++++++ arch/x86/include/asm/xsave.h | 7 ++- arch/x86/kvm/x86.c | 139 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ff90055c7f0..4d8dcbdfc12 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -22,6 +22,8 @@ #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS #define __KVM_HAVE_DEBUGREGS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -299,4 +301,24 @@ struct kvm_debugregs { __u64 reserved[9]; }; +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 29ee4e4c64c..32c36668fa7 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,8 +13,11 @@ #define FXSAVE_SIZE 512 -#define XSTATE_YMM_SIZE 256 -#define XSTATE_YMM_OFFSET (512 + 64) +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* * These are the features that the OS can handle currently.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 795999e1ac1..0c8dc9614e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1680,6 +1680,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1703,6 +1704,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; default: r = 0; break; @@ -2355,6 +2359,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + sizeof(struct xsave_struct)); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, sizeof(struct xsave_struct)); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2556,6 +2631,70 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } + case KVM_GET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + + r = -EFAULT; + if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + r = -EFAULT; + if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + break; + } + case KVM_GET_XCRS: { + struct kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + + r = -EFAULT; + if (copy_to_user(argp, xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + struct 
kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + r = -EFAULT; + if (copy_from_user(xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + break; + } default: r = -EINVAL; } -- cgit v1.2.3 From ac3cd03cca91d481b41e8236aaa41a7f9fafa62f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:28:14 +0800 Subject: KVM: MMU: rename 'page' and 'shadow_page' to 'sp' Rename 'page' and 'shadow_page' to 'sp' to better fit the context Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9308be2d5c0..e461f2393d8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ err: return 0; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { pt_element_t gpte; @@ -264,7 +264,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { if (!is_present_gpte(gpte)) { - if (page->unsync) + if (sp->unsync) new_spte = shadow_trap_nonpresent_pte; else new_spte = shadow_notrap_nonpresent_pte; @@ -273,7 +273,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; pfn = vcpu->arch.update_pte.pfn; @@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * we call mmu_set_spte() with reset_host_protection = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -300,7 +300,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *shadow_page; + struct kvm_mmu_page *sp; u64 spte, *sptep = NULL; int direct; gfn_t table_gfn; @@ -341,9 +341,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, access &= ~ACC_WRITE_MASK; /* * It is a large guest pages backed by small host pages, - * So we set @direct(@shadow_page->role.direct)=1, and - * set @table_gfn(@shadow_page->gfn)=the base page frame - * for linear translations. + * So we set @direct(@sp->role.direct)=1, and set + * @table_gfn(@sp->gfn)=the base page frame for linear + * translations. 
*/ table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; @@ -351,21 +351,21 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 0; table_gfn = gw->table_gfn[level - 2]; } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } } - spte = __pa(shadow_page->spt) + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *sptep = spte; -- cgit v1.2.3 From cb83cad2e7e1cdedb2abb9cef2ac076defa679d4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:29:42 +0800 Subject: KVM: MMU: cleanup for dirty page judgment Use a wrapper function to clean up the page-dirty judgment. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e461f2393d8..efba353369e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -287,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -319,7 +319,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, + is_dirty_gpte(gw->ptes[gw->level-1]), ptwrite, level, gw->gfn, pfn, false, true); break; -- cgit v1.2.3 From f918b443527e98476c8cc45683152106b9e4bedc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:30:36 +0800 Subject: KVM: MMU: avoid double write protection in sync page path The sync page is already write-protected in mmu_sync_children(); don't write-protect it again. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa98fca03ed..ff333572be7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,6 +1216,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else +/* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list, bool clear_unsync) { @@ -1224,11 +1225,8 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - if (clear_unsync) { - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); -- cgit v1.2.3 From be71e061d15c0aad4f8c2606f76c57b8a19792fd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:31:38 +0800 Subject: KVM: MMU: don't mark pte
notrap if it's just sync transient If the sp is just being synced transiently, don't mark its ptes notrap. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 11 ++++------- arch/x86/kvm/paging_tmpl.h | 5 +++-- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2813d658f3..2ec2e27a403 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -241,7 +241,7 @@ struct kvm_mmu { void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); + struct kvm_mmu_page *sp, bool clear_unsync); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff333572be7..d1e09f3c561 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1103,7 +1103,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool clear_unsync) { return 1; } @@ -1228,7 +1228,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1237,7 +1237,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 0; } -static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1245,9 +1244,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (!ret) - mmu_convert_notrap(sp); - else + if (ret) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); return ret; @@ -1273,7 +1270,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index efba353369e..863920f649f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -578,7 +578,8 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * can't change unless all sptes pointing to it are nuked first. * - Alias changes zap the entire shadow cache.
*/ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { int i, offset, nr_present; bool reset_host_protection; @@ -615,7 +616,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_gpte(gpte)) + if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; -- cgit v1.2.3 From ebdea638df04ae6293a9a5414d98ad843c69e82f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:32:34 +0800 Subject: KVM: MMU: cleanup for __mmu_unsync_walk() Decrement sp->unsync_children after clearing a bit in unsync_child_bitmap. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1e09f3c561..41e801b5306 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1160,9 +1160,11 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) + if (!ret) { __clear_bit(i, sp->unsync_child_bitmap); - else if (ret > 0) + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); + } else if (ret > 0) nr_unsync_leaf += ret; else return ret; @@ -1176,8 +1178,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) - sp->unsync_children = 0; return nr_unsync_leaf; } -- cgit v1.2.3 From 7a8f1a74e4193d21e55b35928197486f2c047efb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:34:04 +0800 Subject: KVM: MMU: clear unsync_child_bitmap completely In the current code, some pages' unsync_child_bitmap is not cleared completely in mmu_sync_children(); for example, if two PDPEs share one page-directory table, one of the PDPEs' unsync_child_bitmap bits is never cleared, as the sketch below illustrates.
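A small, compilable user-space model of that bookkeeping (purely illustrative: the struct, sizes, and helper names below are invented for the sketch and are not the kernel's). Two parent entries point at the same child table; walking in through only one of them clears only that parent's bitmap bit, leaving the other parent's state stale:

        #include <stdio.h>

        #define ENTRIES 512
        #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

        struct parent_model {
                unsigned long unsync_child_bitmap[ENTRIES / (8 * sizeof(unsigned long))];
                unsigned int unsync_children;
        };

        static void set_unsync(struct parent_model *p, int idx)
        {
                p->unsync_child_bitmap[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
                p->unsync_children++;
        }

        /* Mirrors the previous patch's invariant: clear the bit and
         * decrement the counter together. */
        static void sync_child(struct parent_model *p, int idx)
        {
                p->unsync_child_bitmap[idx / BITS_PER_LONG] &= ~(1UL << (idx % BITS_PER_LONG));
                p->unsync_children--;
        }

        int main(void)
        {
                struct parent_model pdpte0 = { { 0 }, 0 }, pdpte1 = { { 0 }, 0 };

                /* Two PDPEs share one page-directory table at entry 7. */
                set_unsync(&pdpte0, 7);
                set_unsync(&pdpte1, 7);

                /* The sync walk happens to come in via pdpte0 only... */
                sync_child(&pdpte0, 7);

                /* ...so pdpte1 still claims an unsync child even though
                 * the shared table itself has been synced. */
                printf("pdpte0: %u unsync, pdpte1: %u unsync\n",
                       pdpte0.unsync_children, pdpte1.unsync_children);
                return 0;
        }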
Currently this does no harm beyond a little extra work, but it is preparatory work for a later patch. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 41e801b5306..ab12be4eb10 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1149,33 +1149,38 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) { - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); - } else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; - if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } - } + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); } -- cgit v1.2.3 From 1047df1fb682a41eb9885d6b3f2d04d6c8fd3756 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:35:15 +0800 Subject: KVM: MMU: don't walk every parent page while marking unsync When we mark a parent's unsync_child_bitmap, there is no need to walk that parent's own parents if it is already unsynced; this avoids some unnecessary work. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 61 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ab12be4eb10..8c2f580956d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,7 +175,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -1024,7 +1024,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } - static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; @@ -1034,63 +1033,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + fn(parent_sp, sp->parent_pte); return; } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i <
NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) break; - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); } } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - index = spte - sp->spt; - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) - sp->unsync_children++; - WARN_ON(!sp->unsync_children); + mmu_parent_walk(sp, mark_unsync); } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; + unsigned int index; - if (!sp->parent_pte) + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; - - if (!sp->multimapped) { - kvm_mmu_update_unsync_bitmap(sp->parent_pte); + if (sp->unsync_children++) return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); - } -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ - kvm_mmu_update_parents_unsync(sp); - return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - mmu_parent_walk(sp, unsync_walk_fn); - kvm_mmu_update_parents_unsync(sp); + kvm_mmu_mark_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From bd371396b38ffc4bd6444b0203f33b99d18cedd0 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Mon, 14 Jun 2010 11:42:15 -1000 Subject: KVM: x86: fix -DDEBUG oops Fix a slight error with an assertion in the local APIC code. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d8258a0060f..024f6d1c299 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -329,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); - ASSERT(!target); + ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == 0) -- cgit v1.2.3 From c37eda138473f8c843f2b4aa8da252fdfdaaafa3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 15 Jun 2010 09:03:33 +0800 Subject: KVM: x86 emulator: fix pusha instruction emulation The pusha emulation only wrote back the last register (EDI); the write-back needed for the other registers was skipped. This patch fixes it.
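The shape of the fix, as a compilable stand-alone model (the emulator's real staging/commit machinery is more involved; push() and writeback() here are simplified stand-ins):

        #include <stdio.h>

        enum { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NREGS };

        static unsigned long regs[NREGS];
        static unsigned long stack[64];
        static int sp;

        /* Stand-in for emulate_push(): stages the pushed value. */
        static void push(unsigned long val)
        {
                stack[sp++] = val;
        }

        /* Stand-in for writeback(): commits the register side effects of
         * the current micro-op. Before the fix this ran only once, after
         * the loop, so only the final iteration's effects landed. */
        static int writeback(void)
        {
                regs[RSP] -= sizeof(unsigned long);
                return 0;
        }

        int main(void)
        {
                unsigned long old_rsp;
                int reg;

                regs[RSP] = old_rsp = 0x8000;
                for (reg = RAX; reg <= RDI; reg++) {
                        /* pusha pushes the pre-instruction SP value. */
                        push(reg == RSP ? old_rsp : regs[reg]);
                        if (writeback() != 0)   /* the fix: commit per iteration */
                                return 1;
                }
                printf("sp after pusha: %#lx (8 slots committed)\n", regs[RSP]);
                return 0;
        }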
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 133 +++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4c2dcd1032..c990db0a3a0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1553,6 +1553,64 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return X86EMUL_CONTINUE; +} + static inline void emulate_push(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1651,11 +1709,12 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt, +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { @@ -1663,8 +1722,18 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt, (c->src.val = old_esp) : (c->src.val = c->regs[reg]); emulate_push(ctxt, ops); + + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + ++reg; } + + /* Disable writeback. */ + c->dst.type = OP_NONE; + + return rc; } static int emulate_popa(struct x86_emulate_ctxt *ctxt, @@ -1817,64 +1886,6 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - u32 err; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return X86EMUL_CONTINUE; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2689,7 +2700,9 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt, ops); + rc = emulate_pusha(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); -- cgit v1.2.3 From 33572ac0ad5ba5016da72e6654e607726568f9c0 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:11 -0400 Subject: KVM: x86: Introduce a workqueue to deliver PIT timer interrupts We really want to "kvm_set_irq" during the hrtimer callback, but that is risky because that is during interrupt context. Instead, offload the work to a workqueue, which is a bit safer and should provide most of the same functionality. Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 141 +++++++++++++++++++++++++++++++-------------------- arch/x86/kvm/i8254.h | 4 +- arch/x86/kvm/irq.c | 1 - 3 files changed, 88 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 188d82762c1..467cc47fb73 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,6 +34,7 @@ #include #include +#include #include "irq.h" #include "i8254.h" @@ -244,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - raw_spin_lock(&ps->inject_lock); - if (atomic_dec_return(&ps->pit_timer.pending) < 0) + int value; + + spin_lock(&ps->inject_lock); + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. 
Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; - raw_spin_unlock(&ps->inject_lock); + spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -264,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit) { - pr_debug("execute del timer!\n"); - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -281,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { struct kvm_timer *pt = &ps->pit_timer; @@ -292,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = kvm_timer_fn; + pt->timer.function = pit_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -347,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) } break; default: - destroy_pit_timer(&ps->pit_timer); + destroy_pit_timer(kvm->arch.vpit); } } @@ -626,7 +692,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - raw_spin_lock_init(&pit->pit_state.inject_lock); + spin_lock_init(&pit->pit_state.inject_lock); + + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -685,54 +758,10 @@ void 
kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct kvm *kvm = vcpu->kvm; - struct kvm_kpit_state *ps; - - if (pit) { - int inject = 0; - ps = &pit->pit_state; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - raw_spin_lock(&ps->inject_lock); - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - raw_spin_unlock(&ps->inject_lock); - if (inject) - __inject_pit_timer_intr(kvm); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c..46d08ca0b48 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - raw_spinlock_t inject_lock; + spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -40,6 +40,8 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 0f4e488331c..2095a049835 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); - kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); -- cgit v1.2.3 From e7dca5c0eba63e4ba8e3586c4b37863fd7fadb5a Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:12 -0400 Subject: KVM: x86: Allow any LAPIC to accept PIC interrupts If the guest wants to accept timer interrupts on a CPU other than the BSP, we need to remove this gate. 
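With the BSP check gone, the predicate reduces to the virtual-wire rules alone. A compilable sketch of the resulting logic (the LVT0 bit positions below follow the x86 LAPIC layout, but the constants are spelled out here rather than taken from kernel headers):

        #include <stdio.h>

        #define APIC_LVT_MASKED  (1u << 16) /* LVT0 bit 16: entry masked */
        #define APIC_MODE_EXTINT 0x7        /* delivery-mode value for ExtINT */

        static unsigned int delivery_mode(unsigned int lvt0)
        {
                return (lvt0 >> 8) & 0x7;   /* LVT delivery mode, bits 10:8 */
        }

        /* Mirrors kvm_apic_accept_pic_intr() after this patch: ANY vcpu
         * whose LAPIC is hw-disabled, or whose LVT0 is unmasked ExtINT,
         * may take a PIC interrupt, not just the BSP. */
        static int accepts_pic_intr(int apic_hw_enabled, unsigned int lvt0)
        {
                if (!apic_hw_enabled)
                        return 1;
                if (!(lvt0 & APIC_LVT_MASKED) &&
                    delivery_mode(lvt0) == APIC_MODE_EXTINT)
                        return 1;
                return 0;
        }

        int main(void)
        {
                printf("disabled lapic:        %d\n", accepts_pic_intr(0, 0));
                printf("enabled, LVT0=ExtINT:  %d\n", accepts_pic_intr(1, 0x700));
                printf("enabled, LVT0 masked:  %d\n",
                       accepts_pic_intr(1, 0x700 | APIC_LVT_MASKED));
                return 0;
        }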
Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 024f6d1c299..49573c78c24 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1107,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (kvm_vcpu_is_bsp(vcpu)) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; return r; } -- cgit v1.2.3 From 7d5993d63f2bac75b89e171a7098044ec4bc701f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 17 Jun 2010 17:33:55 +0800 Subject: KVM: x86 emulator: fix group3 instruction decoding A Group 3 instruction with a ModRM reg field of 001 is defined as a test instruction in the AMD architecture, and emulate_grp3() is already able to emulate it, so fix the decoding. static inline int emulate_grp3(...) { ... switch (c->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); ... } Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c990db0a3a0..abb8cec420a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -336,11 +336,11 @@ static u32 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM, 0, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = -- cgit v1.2.3 From a1a005f36e0defea7c5490772c318c6af2261d31 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:47:34 +0300 Subject: KVM: Fix xsave and xcr save/restore memory leak We allocate temporary kernel buffers for these structures, but never free them.
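The bug in miniature, as a compilable user-space model (the names and sizes are invented; free(NULL) being a no-op is what makes the unconditional cleanup at the end safe):

        #include <stdlib.h>

        long vcpu_ioctl_model(int op)
        {
                void *xsave = NULL, *xcrs = NULL;
                long r = -1;

                switch (op) {
                case 0: /* GET_XSAVE-like path */
                        xsave = calloc(1, 4096);
                        if (!xsave)
                                break;  /* early exit must not leak */
                        /* ... fill the buffer and copy it out ... */
                        r = 0;
                        break;
                case 1: /* GET_XCRS-like path */
                        xcrs = calloc(1, 128);
                        if (!xcrs)
                                break;
                        r = 0;
                        break;
                }

                /* The fix: release both on every path out, just as the
                 * patch adds kfree(xsave)/kfree(xcrs) beside kfree(lapic). */
                free(xsave);
                free(xcrs);
                return r;
        }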
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c8dc9614e7..d918cb15e5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2437,6 +2437,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; struct kvm_lapic_state *lapic = NULL; + struct kvm_xsave *xsave = NULL; + struct kvm_xcrs *xcrs = NULL; switch (ioctl) { case KVM_GET_LAPIC: { @@ -2632,8 +2634,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2648,8 +2648,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2663,8 +2661,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2680,8 +2676,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2700,6 +2694,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: kfree(lapic); + kfree(xsave); + kfree(xcrs); return r; } -- cgit v1.2.3 From d1ac91d8a2f00dc6a3954f7e8971339b0893edc4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:54:43 +0300 Subject: KVM: Consolidate load/save temporary buffer allocation and freeing Instead of three temporary variables and three free calls, have one temporary variable (with four names) and one free call. 
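A sketch of the consolidated pattern (the struct contents and sizes are placeholders). Whichever member a case assigns, u.buffer aliases the same pointer, so a single free covers every path:

        #include <stdlib.h>

        struct lapic_state { char data[1024]; };
        struct xsave_state { char data[4096]; };
        struct xcrs_state  { char data[128];  };

        long vcpu_ioctl_model(int op)
        {
                union {
                        struct lapic_state *lapic;
                        struct xsave_state *xsave;
                        struct xcrs_state  *xcrs;
                        void *buffer;   /* common view for the single free */
                } u;
                long r = -1;

                u.buffer = NULL;
                switch (op) {
                case 0:
                        u.lapic = calloc(1, sizeof(*u.lapic));
                        r = u.lapic ? 0 : -1;
                        break;
                case 1:
                        u.xsave = calloc(1, sizeof(*u.xsave));
                        r = u.xsave ? 0 : -1;
                        break;
                }

                free(u.buffer); /* one temporary variable, one free call */
                return r;
        }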
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d918cb15e5b..8e60b6c9c0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2436,25 +2436,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; - struct kvm_lapic_state *lapic = NULL; - struct kvm_xsave *xsave = NULL; - struct kvm_xcrs *xcrs = NULL; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2463,14 +2467,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; r = -EFAULT; - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; r = 0; @@ -2634,68 +2638,66 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); r = -EFAULT; - if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) break; r = 0; break; } case KVM_SET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; r = -EFAULT; - if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) break; - r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } case KVM_GET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); r = -EFAULT; - if (copy_to_user(argp, xcrs, + if (copy_to_user(argp, u.xcrs, sizeof(struct kvm_xcrs))) break; r = 0; break; } case KVM_SET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; r = -EFAULT; - if (copy_from_user(xcrs, argp, + if (copy_from_user(u.xcrs, argp, sizeof(struct kvm_xcrs))) break; - r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + r = 
kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } default: r = -EINVAL; } out: - kfree(lapic); - kfree(xsave); - kfree(xcrs); + kfree(u.buffer); return r; } -- cgit v1.2.3 From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:44:20 +0300 Subject: KVM: Remove memory alias support As advertised in feature-removal-schedule.txt. Equivalent support is provided by overlapping memory regions. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 21 ------- arch/x86/kvm/mmu.c | 17 ++---- arch/x86/kvm/paging_tmpl.h | 3 +- arch/x86/kvm/x86.c | 125 ---------------------------------------- arch/x86/kvm/x86.h | 7 --- 5 files changed, 5 insertions(+), 168 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ec2e27a403..a57cdeacc4d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -69,8 +69,6 @@ #define IOPL_SHIFT 12 -#define KVM_ALIAS_SLOTS 4 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 @@ -362,24 +360,7 @@ struct kvm_vcpu_arch { u64 hv_vapic; }; -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL - unsigned long flags; -}; - -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION - -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; -}; - struct kvm_arch { - struct kvm_mem_aliases *aliases; - unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_alloc_mmu_pages; @@ -655,8 +636,6 @@ void kvm_disable_tdp(void); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); -struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); - static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8c2f580956d..c5501bc1010 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm, struct kvm_memory_slot *slot; int *largepage_idx; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; @@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. 
- * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; - gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); @@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; - gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->unsync) continue; - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 863920f649f..a21a86ef9e2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -576,7 +576,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - Alias changes zap the entire shadow cache. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) @@ -611,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + if (gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e60b6c9c0b..62596d373a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2740,115 +2740,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases, *old_aliases; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out; - - mutex_lock(&kvm->slots_lock); - - /* invalidate any gfn reference in case of deletion/shrinking */ - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kvm_mmu_zap_all(kvm); - kfree(old_aliases); - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out_unlock; - - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - - p = &aliases->aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - p->flags &= ~(KVM_ALIAS_INVALID); - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (aliases->aliases[n - 1].npages) - break; - aliases->naliases = n; - - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_aliases); - r = 0; - -out_unlock: - mutex_unlock(&kvm->slots_lock); -out: - return r; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; @@ -3056,7 +2947,6 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; - struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -3101,14 +2991,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: - r = -EFAULT; - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); - if (r) - goto out; - break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -5559,12 +5441,6 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!kvm->arch.aliases) { - kfree(kvm); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5622,7 +5498,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); cleanup_srcu_struct(&kvm->srcu); - kfree(kvm->arch.aliases); kfree(kvm); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285..b7a404722d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } -static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) -{ - return rcu_dereference_check(kvm->arch.aliases, - srcu_read_lock_held(&kvm->srcu) - || 
lockdep_is_held(&kvm->slots_lock)); -} - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From b74a07beed0e64bfba413dcb70dd6749c57f43dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:48:05 +0300 Subject: KVM: Remove kernel-allocated memory regions Equivalent (and better) functionality is provided by user-allocated memory regions. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62596d373a4..9be6e4e5e8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2967,22 +2967,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) -- cgit v1.2.3 From 073d46133ab0b42154f6b8429f4f66dbe2760bda Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:34:34 +0300 Subject: KVM: i8259: reduce excessive abstraction for pic_irq_request() Part of the i8259 code pretends it isn't part of kvm, but we know better. Reduce excessive abstraction, eliminating callbacks and void pointers. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++---------- arch/x86/kvm/irq.h | 4 ---- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 2c73f449314..caf6e1b3d95 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,8 @@ #include #include "trace.h" +static void pic_irq_request(struct kvm *kvm, int level); + static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { @@ -175,9 +177,9 @@ static void pic_update_irq(struct kvm_pic *s) } irq = pic_get_irq(&s->pics[0]); if (irq >= 0) - s->irq_request(s->irq_request_opaque, 1); + pic_irq_request(s->kvm, 1); else - s->irq_request(s->irq_request_opaque, 0); + pic_irq_request(s->kvm, 0); } void kvm_pic_update_irq(struct kvm_pic *s) @@ -262,8 +264,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; - struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; u8 irr = s->irr, isr = s->imr; s->last_irr = 0; @@ -302,8 +303,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) /* * deassert a pending interrupt */ - s->pics_state->irq_request(s->pics_state-> - irq_request_opaque, 0); + pic_irq_request(s->pics_state->kvm, 0); s->init_state = 1; s->init4 = val & 1; if (val & 0x02) @@ -519,9 +519,8 @@ static int picdev_read(struct kvm_io_device *this, /* * callback when PIC0 irq status changed */ -static void pic_irq_request(void *opaque, int level) +static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); @@ -550,8 +549,6 @@ struct kvm_pic *kvm_create_pic(struct 
kvm *kvm) s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; - s->irq_request = pic_irq_request; - s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413..ffed06871c5 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -38,8 +38,6 @@ struct kvm; struct kvm_vcpu; -typedef void irq_request_func(void *opaque, int level); - struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ @@ -67,8 +65,6 @@ struct kvm_pic { unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); -- cgit v1.2.3 From 36633f32ba4c238403d19584754b30fe469d6dcb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:38:06 +0300 Subject: KVM: i8259: simplify pic_irq_request() calling sequence Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index caf6e1b3d95..bc10f0bd381 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -176,10 +176,7 @@ static void pic_update_irq(struct kvm_pic *s) pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) - pic_irq_request(s->kvm, 1); - else - pic_irq_request(s->kvm, 0); + pic_irq_request(s->kvm, irq >= 0); } void kvm_pic_update_irq(struct kvm_pic *s) -- cgit v1.2.3 From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:34:53 +0300 Subject: KVM: Add mini-API for vcpu->requests Makes it a little more readable and hackable. 
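Judging from the call sites in this diff, the generic half of the mini-API (which lives outside arch/x86 and is therefore not shown here) amounts to thin wrappers over the old open-coded bitops, presumably along these lines:

        static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
        {
                set_bit(req, &vcpu->requests);
        }

        static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
        {
                return test_and_clear_bit(req, &vcpu->requests);
        }

The win is readability at the call sites: the test-and-clear semantics are named once instead of being re-derived at every use.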
Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 27 +++++++++++++-------------- 6 files changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 49573c78c24..77d8c0f4817 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -534,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c5501bc1010..690a7fc58c1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_mmu_mark_parents_unsync(sp); } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); @@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f7a6fdcf8ef..587b99d37d4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1494,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 564548fbb3d..e16a0dbe74d 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -32,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); } if (waitqueue_active(q)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 345a3547051..661c6e199b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -899,7 +899,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) unsigned long sysenter_esp; kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9be6e4e5e8e..7ef44107a14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -296,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + 
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -948,7 +948,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); return 1; } @@ -2253,7 +2253,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -4617,7 +4617,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->run->request_interrupt_window; if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); r = kvm_mmu_reload(vcpu); @@ -4625,26 +4625,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) kvm_write_guest_time(vcpu); - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_x86_ops->tlb_flush(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } @@ -4773,7 +4772,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: @@ -5255,7 +5254,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); } -- cgit v1.2.3 From 7ac77099ce88a0c31b75acd0ec5ef3da4415a6d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 10:57:45 +0300 Subject: KVM: Prevent internal slots from being COWed If a process with a memory slot is COWed, the page will change its address (despite having an elevated reference count). This breaks internal memory slots which have their physical addresses loaded into vmcs registers (see the APIC access memory slot). 
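A small user-space demonstration of the underlying mmap semantics (it shows the sharing difference that matters here; in KVM the concern is the physical page moving under a COW write while the VMCS still caches the old address; error handling omitted for brevity):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <sys/wait.h>
        #include <unistd.h>

        int main(void)
        {
                char *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                char *shar = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

                if (fork() == 0) {
                        /* Writing the private page triggers COW: the child
                         * silently switches to a new physical page. The
                         * shared page stays one physical page for both. */
                        strcpy(priv, "child-only");
                        strcpy(shar, "both-see-this");
                        _exit(0);
                }
                wait(NULL);
                printf("private: '%s'  shared: '%s'\n", priv, shar);
                return 0;
        }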
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ef44107a14..68be38e233f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5491,6 +5491,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5503,7 +5508,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(&current->mm->mmap_sem); -- cgit v1.2.3 From 6c3f6041172b78d5532c6bf3680d304e92ec2e66 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 22 Jun 2010 13:49:21 +0800 Subject: KVM: x86: Enable AVX for guest Enable Intel(R) Advanced Vector Extensions (AVX) for the guest. The detection of the AVX feature includes testing the OSXSAVE bit. When the OSXSAVE bit is not set, AVX instructions raise #UD even if AVX is supported, so we're safe to expose the AVX bits to the guest directly. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68be38e233f..d39d6b25d3e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1963,13 +1963,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | -- cgit v1.2.3 From 529df65e394e30a78f2633b575fd81fa5b973e30 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Mon, 21 Jun 2010 11:29:40 -0400 Subject: KVM: Search the LAPICs for one that will accept a PIC interrupt Older versions of 32-bit Linux have a "Checking 'hlt' instruction" test where they repeatedly call the 'hlt' instruction, and then expect a timer interrupt to kick the CPU out of halt. This happens before any LAPIC or IOAPIC setup, which means that all of the APICs are in virtual wire mode at this point. Unfortunately, the current implementation of virtual wire mode is hardcoded to only kick the BSP, so if a crash+kexec occurs on a different vcpu, it will never get kicked. This patch makes pic_unlock() do the equivalent of kvm_irq_delivery_to_apic() for the IOAPIC code. That is, it runs through all of the vcpus looking for one that is in virtual wire mode. In the normal case where LAPICs and IOAPICs are configured, this won't be used at all.
In the bootstrap phase of a modern OS, before the LAPICs and IOAPICs are configured, this will have exactly the same behavior as today; VCPU0 is always looked at first, so it will always get out of the loop after the first iteration. This will only go through the loop more than once during a kexec/kdump, in which case it will only do it a few times until the kexec'ed kernel programs the LAPIC and IOAPIC. Signed-off-by: Chris Lalancette Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bc10f0bd381..819b748a33f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -46,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *found = NULL; + int i; s->wakeup_needed = false; raw_spin_unlock(&s->lock); if (wakeup) { - vcpu = s->kvm->bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + kvm_vcpu_kick(found); } } -- cgit v1.2.3 From 3e0075094734de122e4cb09f930fa853a3c59f09 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jun 2010 14:26:18 +0300 Subject: KVM: Simplify vcpu_enter_guest() mmu reload logic slightly No need to reload the mmu in between two different vcpu->requests checks. kvm_mmu_reload() may trigger KVM_REQ_TRIPLE_FAULT, but that will be caught during atomic guest entry later. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6b25d3e..27322d34123 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4616,15 +4616,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) + if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) @@ -4649,6 +4643,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); -- cgit v1.2.3 From f5f48ee15c2ee3e44cf429e34b16c6fa9b900246 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 30 Jun 2010 12:25:15 +0800 Subject: KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Some guest device drivers may use "non-snooped" I/O and explicitly issue WBINVD or CLFLUSH on a RAM region. Since a migration may occur before the WBINVD or CLFLUSH, we need to maintain data consistency either by: 1: flushing the cache (wbinvd) when the guest is scheduled out, if there is no wbinvd exit, or 2: executing wbinvd on all dirty physical CPUs when the guest exits on wbinvd.
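A compilable model of the bookkeeping the patch introduces (a flat bitmask stands in for cpumask_var_t, and flush_cpu() for the wbinvd IPI):

        #include <stdio.h>

        #define NCPUS 64

        struct vcpu_model {
                unsigned long wbinvd_dirty_mask; /* one bit per physical cpu */
        };

        static void flush_cpu(int cpu)  /* stand-in for the wbinvd IPI */
        {
                printf("wbinvd on cpu %d\n", cpu);
        }

        /* On every load, remember which physical cpu may now hold dirty,
         * non-snooped data for this guest. */
        static void vcpu_load(struct vcpu_model *v, int cpu)
        {
                v->wbinvd_dirty_mask |= 1UL << cpu;
        }

        /* On an emulated WBINVD, flush exactly the cpus the guest has
         * dirtied, then start the mask over. */
        static void emulate_wbinvd(struct vcpu_model *v)
        {
                int cpu;

                for (cpu = 0; cpu < NCPUS; cpu++)
                        if (v->wbinvd_dirty_mask & (1UL << cpu))
                                flush_cpu(cpu); /* smp_call_function_many */
                v->wbinvd_dirty_mask = 0;
        }

        int main(void)
        {
                struct vcpu_model v = { 0 };

                vcpu_load(&v, 2);
                vcpu_load(&v, 5);
                emulate_wbinvd(&v); /* flushes cpus 2 and 5 only */
                return 0;
        }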
Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/emulate.c | 5 ++++- arch/x86/kvm/svm.c | 7 +++++++ arch/x86/kvm/vmx.c | 10 +++++++++- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a57cdeacc4d..2bda62485c4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,8 @@ struct kvm_vcpu_arch { /* fields used by HYPER-V emulation */ u64 hv_vapic; + + cpumask_var_t wbinvd_dirty_mask; }; struct kvm_arch { @@ -514,6 +517,8 @@ struct kvm_x86_ops { void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + bool (*has_wbinvd_exit)(void); + const struct trace_print_flags *exit_reasons_str; }; @@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abb8cec420a..e8bdddc4509 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3138,8 +3138,11 @@ twobyte_insn: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; break; - case 0x08: /* invd */ case 0x09: /* wbinvd */ + kvm_emulate_wbinvd(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ c->dst.type = OP_NONE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 587b99d37d4..56c9b6bd765 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_has_wbinvd_exit(void) +{ + return true; +} + static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .set_supported_cpuid = svm_set_supported_cpuid, + + .has_wbinvd_exit = svm_has_wbinvd_exit, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 661c6e199b4..4dfb1dc09c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -3397,7 +3403,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ + kvm_emulate_wbinvd(vcpu); return 1; } @@ -4347,6 +4353,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .rdtscp_supported = vmx_rdtscp_supported, .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
27322d34123..3d72fc06705 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1783,8 +1783,28 @@ out: return r; } +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { unsigned long khz = cpufreq_quick_get(cpu); @@ -3660,6 +3680,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } + wbinvd(); + return X86EMUL_CONTINUE; +} +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); @@ -5263,6 +5298,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5392,7 +5428,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: -- cgit v1.2.3 From 36a2e6774bfb5f32a0f23bb155f1f960321f291b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:02 +0800 Subject: KVM: MMU: fix writable sync sp mapping When we sync many unsync sps at one time (in mmu_sync_children()), we may map an spte writable. This is dangerous if one unsync sp's mapped gfn is another unsync sp's gfn. For example: SP1.pte[0] = P SP2.gfn's pfn = P [SP1.pte[0] = SP2.gfn's pfn] First, we write-protect SP1 and SP2, but both are still unsync sps. Then we sync SP1 first; it detects that SP1.pte[0].gfn has only one unsync sp, namely SP2, so it maps it writable. But we plan to sync SP2 soon, and at this point SP2->unsync is no longer reliable: when we later sync SP2, SP2->gfn is already writable. The final result: SP2 is a synced page, but SP2.gfn is writable. This bug corrupts the guest's page tables. Fix by making the mapping read-only if the mapped gfn has shadow pages.
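The shape of the fix can be seen in a small standalone model (simplified types and logic; a sketch of the reordered check, not the kernel's mmu_need_write_protect()):

    #include <stdbool.h>
    #include <stdio.h>

    struct sp { int level; bool unsync; };

    /* After the fix, !can_unsync forces read-only as soon as ANY shadow
     * page exists for the gfn, including one that is already unsync,
     * instead of only when deciding whether to unsync a synced sp. */
    static bool need_write_protect(struct sp *sps, int n, bool can_unsync)
    {
            for (int i = 0; i < n; i++) {
                    if (!can_unsync)
                            return true;   /* check moved to the top */
                    if (sps[i].level != 1) /* only last-level sps may unsync */
                            return true;
                    sps[i].unsync = true;
            }
            return false;
    }

    int main(void)
    {
            struct sp s = { 1, true };     /* SP2: already unsync */
            printf("%d\n", need_write_protect(&s, 1, false)); /* 1: keep RO */
            return 0;
    }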
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 690a7fc58c1..ca07ed083b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1810,11 +1810,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool need_unsync = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) + return 1; + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; if (!need_unsync && !s->unsync) { - if (!can_unsync || !oos_shadow) + if (!oos_shadow) return 1; need_unsync = true; } -- cgit v1.2.3 From 5fd5387c89ec99ff6cb82d2477ffeb7211b781c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:45 +0800 Subject: KVM: MMU: fix conflict access permissions in direct sp In non-direct mapping, we mark an sp as 'direct' when it maps the guest's large page, but its access is encoded from the upper page-structure entries only and does not include the last mapping, which can cause an access conflict. For example, take this mapping: [W] / PDE1 -> |---| P[W] | | LPA \ PDE2 -> |---| [R] P has two children, PDE1 and PDE2, and both PDE1 and PDE2 map the same large page (LPA). P's access is RW, PDE1's access is RW, and PDE2's access is RO (considering only read-write permissions here). When the guest accesses through PDE1, we create a direct sp for LPA; the sp's access comes from P, which is W, so we mark the ptes W in this sp. Then, when the guest accesses through PDE2, we find LPA's shadow page, which is the same as PDE1's, and mark the ptes RO. So if the guest accesses through PDE1 again, an incorrect #PF occurs. Fix by encoding the last mapping's access into the direct shadow page. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a21a86ef9e2..f4e4aaa65ff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,6 +339,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; + access &= gw->pte_access; + /* * It is a large guest pages backed by small host pages, * So we set @direct(@sp->role.direct)=1, and set -- cgit v1.2.3 From 9e7b0e7fba45ca3c6357aeb7091ebc281f1de365 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:03:28 +0800 Subject: KVM: MMU: fix direct sp's access corrupted If the mapping is writable but the dirty flag is not set, we will find the read-only direct sp and set up the mapping; then, if a write #PF occurs, we will mark this mapping writable in the read-only direct sp. Now any genuinely read-only mapping can happily write through it without a #PF, which may break the guest's COW. Fix by re-installing the mapping when the write #PF occurs.
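A tiny standalone demonstration of the idea (the ACC_* values here are illustrative, not the kernel's masks): folding the last level's permissions into the direct sp's access key makes PDE1 and PDE2 select different shadow pages:

    #include <stdio.h>

    #define ACC_WRITE 0x2
    #define ACC_USER  0x4

    /* Access key for a direct sp: the whole walk's permissions,
     * now including the last (large) mapping level. */
    static unsigned direct_sp_access(unsigned pt_access, unsigned pte_access,
                                     int dirty)
    {
            unsigned access = pt_access & pte_access;
            if (!dirty)
                    access &= ~ACC_WRITE;
            return access;
    }

    int main(void)
    {
            /* PDE1 is RW, PDE2 is RO: different keys, different direct sps. */
            printf("PDE1 key %x, PDE2 key %x\n",
                   direct_sp_access(ACC_WRITE | ACC_USER, ACC_WRITE | ACC_USER, 1),
                   direct_sp_access(ACC_WRITE | ACC_USER, ACC_USER, 1));
            return 0;
    }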
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f4e4aaa65ff..117d63f6304 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -325,8 +325,32 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + unsigned direct_access; + + if (level != gw->level) + continue; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + direct_access = gw->pt_access & gw->pte_access; + if (!is_dirty_gpte(gw->ptes[gw->level - 1])) + direct_access &= ~ACC_WRITE_MASK; + + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + continue; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); -- cgit v1.2.3 From 84754cd8fca66ed476585eabad68cacf42834199 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:05:00 +0800 Subject: KVM: MMU: cleanup FNAME(fetch)() functions Clean up this function, now that we already have the direct sp's access computed up front. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 117d63f6304..59e750c1a26 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -306,12 +306,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, gfn_t table_gfn; int r; int level; + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + unsigned direct_access; pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; + direct_access = gw->pt_access & gw->pte_access; + if (!dirty) + direct_access &= ~ACC_WRITE_MASK; + for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -319,15 +325,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - is_dirty_gpte(gw->ptes[gw->level-1]), - ptwrite, level, + dirty, ptwrite, level, gw->gfn, pfn, false, true); break; } if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; if (level != gw->level) continue; /* * For the direct sp, if the guest pte's dirty bit * changed form clean to dirty, it will corrupt the * sp's access: allow writable in the read-only sp, * so we should update the spte at this point to get * a new sp with the correct access.
*/ - direct_access = gw->pt_access & gw->pte_access; - if (!is_dirty_gpte(gw->ptes[gw->level - 1])) - direct_access &= ~ACC_WRITE_MASK; - child = page_header(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) continue; @@ -359,11 +359,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (level <= gw->level) { - int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_gpte(gw->ptes[level - delta])) - access &= ~ACC_WRITE_MASK; - access &= gw->pte_access; + access = direct_access; /* * It is a large guest pages backed by small host pages, -- cgit v1.2.3 From 828554136bbacae6e39fc31b9cd7e7c660ad7530 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Jul 2010 16:00:11 +0200 Subject: KVM: Remove unnecessary divide operations This patch converts unnecessary divide and modulo operations in the KVM large page related code into logical operations. This allows to convert gfn_t to u64 while not breaking 32 bit builds. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2bda62485c4..50c79b9f5c3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -44,7 +44,8 @@ /* KVM Hugepage definitions for x86 */ #define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ca07ed083b5..a20fd613acf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -423,8 +423,8 @@ static int *slot_largepage_idx(gfn_t gfn, { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].write_count; } @@ -528,8 +528,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].rmap_pde; } -- cgit v1.2.3 From c15a5958a0b6dbf06b3c05972694f04a0c50a4cf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:22 -0400 Subject: x86-64, asm: Directly access per-cpu IST Use a direct per-cpu reference for the IST instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ff..59af275b37a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit v1.2.3 From 72c511dd596cff88d6523f231a0fbb8f73006d51 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:23 -0400 Subject: x86-32, asm: Directly access per-cpu GDT Use a direct per-cpu reference for the GDT instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf15..233c5829e7a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -611,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -791,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS -- cgit v1.2.3 From cc05152ab72d7a65e6ea97d286af4f878c8f7371 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sat, 31 Jul 2010 22:51:01 +0200 Subject: x86,mmiotrace: Add support for tracing STOS instruction Add support for stos access tracing with mmiotrace. Signed-off-by: Marcin Slusarz Acked-by: Pekka Paalanen Cc: Nouveau Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Steven Rostedt LKML-Reference: <20100731205101.GA5860@joi.lan> Signed-off-by: Frederic Weisbecker --- arch/x86/mm/pf_in.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 308e32570d8..38e6d174c49 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; /* IA32 Manual 3, 3-432*/ -static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; static unsigned int rw32[] = { - 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; -static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; static unsigned int mw64[] = {}; #else /* not __i386__ */ static unsigned char prefix_codes[] = { @@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = { static unsigned int reg_rop[] = { 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F }; -static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; static unsigned int rw32[] = { - 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB }; /* 8 bit only */ -static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; /* 16 bit only */ static unsigned int mw16[] = { 0xB70F, 0xBF0F }; /* 16 or 32 bit */ static unsigned int mw32[] = { 0xC7 }; /* 16, 32 or 64 bit */ -static unsigned int mw64[] = { 0x89, 0x8B }; +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; #endif /* not __i386__ */ struct prefix_bits { @@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs) unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) { unsigned int opcode; - unsigned char mod_rm; int reg; unsigned char *p; struct prefix_bits prf; @@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) goto err; do_work: - mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } switch (get_ins_reg_width(ins_addr)) { case 1: return *get_reg_w8(reg, prf.rex, regs); -- cgit v1.2.3 From dd180b3e90253cb4ca95d603a8c17413f8daec69 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 3 Jul 2010 16:02:42 +0800 Subject: KVM: VMX: fix tlb flush with invalid root Commit 341d9b535b6c simplify reload logic while entry guest mode, it can avoid unnecessary sync-root if KVM_REQ_MMU_RELOAD and KVM_REQ_MMU_SYNC both set. 
But it causes an issue: when we handle 'KVM_REQ_TLB_FLUSH', the root may be invalid. This was triggered during my test: Kernel BUG at ffffffffa00212b8 [verbose debug info unavailable] ...... Fix by returning directly if the root is not ready. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 2 -- arch/x86/kvm/vmx.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50c79b9f5c3..502e53f999c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,6 +40,8 @@ 0xFFFFFF0000000000ULL) #define INVALID_PAGE (~(hpa_t)0) +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a20fd613acf..70cdf6876b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,8 +92,6 @@ module_param(oos_shadow, bool, 0644); #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4dfb1dc09c8..2fdcc9819f3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1828,8 +1828,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_context(to_vmx(vcpu)); - if (enable_ept) + if (enable_ept) { + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From be38d276b0189fa86231fc311428622a1981ad62 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:31:27 +0300 Subject: KVM: MMU: Introduce drop_spte() When we call rmap_remove(), we (almost) always immediately follow it with an __set_spte() to a nonpresent pte. Since we need to perform the two operations atomically, to avoid losing the dirty and accessed bits, introduce a helper drop_spte() and convert all call sites. The operation is still nonatomic at this point.
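In a standalone model (stub functions stand in for the mmu helpers), the conversion looks like this; the value of the helper is that the unlink and the write now live in one place, which the following patches can then make atomic:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;
    static const u64 shadow_trap_nonpresent_pte = 0;

    static void rmap_remove(u64 *sptep) { printf("unlink %p from rmap\n", (void *)sptep); }
    static void __set_spte(u64 *sptep, u64 v) { *sptep = v; }

    /* One helper owns both steps of dropping an spte. */
    static void drop_spte(u64 *sptep, u64 new_spte)
    {
            rmap_remove(sptep);
            __set_spte(sptep, new_spte);
    }

    int main(void)
    {
            u64 spte = 0x1234;
            drop_spte(&spte, shadow_trap_nonpresent_pte);
            return (int)spte;
    }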
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 30 +++++++++++++++++------------- arch/x86/kvm/paging_tmpl.h | 13 ++++++------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70cdf6876b5..1ad39cf70e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + rmap_remove(kvm, sptep); + __set_spte(sptep, new_spte); +} + static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; @@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - rmap_remove(kvm, spte); + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - rmap_remove(kvm, &pt[i]); + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - rmap_remove(vcpu->kvm, sptep); - spte = shadow_trap_nonpresent_pte; - goto set_pte; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; } spte |= PT_WRITABLE_MASK; @@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: __set_spte(sptep, spte); +done: return ret; } @@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - rmap_remove(vcpu->kvm, spte); + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 59e750c1a26..796a325c7e5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ 
b/arch/x86/kvm/paging_tmpl.h @@ -353,8 +353,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (is_large_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -516,12 +515,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + drop_spte(vcpu->kvm, sptep, + shadow_trap_nonpresent_pte); need_flush = 1; - } - __set_spte(sptep, shadow_trap_nonpresent_pte); + } else + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -637,12 +637,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; - rmap_remove(vcpu->kvm, &sp->spt[i]); if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - __set_spte(&sp->spt[i], nonpresent); + drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); continue; } -- cgit v1.2.3 From ce061867aa2877605cda96fa8ec7dff15f70a983 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:38:12 +0300 Subject: KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Since we need to make the check atomic, move it to the place that will set the new spte. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ad39cf70e1..fbdca08b8d8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - pfn_t pfn; gfn_t gfn; unsigned long *rmapp; int i; - if (!is_rmap_spte(*spte)) - return; sp = page_header(__pa(spte)); - pfn = spte_to_pfn(*spte); - if (*spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(pfn); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { @@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { + pfn_t pfn; + + if (!is_rmap_spte(*sptep)) { + __set_spte(sptep, new_spte); + return; + } + pfn = spte_to_pfn(*sptep); + if (*sptep & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(*sptep)) + kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); __set_spte(sptep, new_spte); } -- cgit v1.2.3 From a9221dd5ec125fbec1702fae016c6d2ea1a9a3da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:48:06 +0300 Subject: KVM: MMU: Atomically check for accessed bit when dropping an spte Currently, in the window between the check for the accessed bit and actually dropping the spte, a vcpu can access the page through the spte and set the bit, which will be ignored by the mmu. Fix by using an exchange operation to atomically fetch the spte and drop it.
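The exchange can be modeled in userspace with a compiler builtin (a sketch assuming GCC/Clang __atomic builtins; bit positions are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define ACCESSED (1ull << 5)
    #define WRITABLE (1ull << 1)

    static void mark_accessed(void) { puts("pfn accessed"); }
    static void mark_dirty(void)    { puts("pfn dirty"); }

    /* Fetch the old spte and install the new one in a single atomic
     * step, so a concurrently-set accessed/dirty bit cannot be lost. */
    static void drop_spte(uint64_t *sptep, uint64_t new_spte)
    {
            uint64_t old = __atomic_exchange_n(sptep, new_spte,
                                               __ATOMIC_SEQ_CST);
            if (old & ACCESSED)
                    mark_accessed();
            if (old & WRITABLE)
                    mark_dirty();
    }

    int main(void)
    {
            uint64_t spte = ACCESSED | WRITABLE;
            drop_spte(&spte, 0);
            return 0;
    }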
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbdca08b8d8..ba2efcf2b86 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte) #endif } +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ +#ifdef CONFIG_X86_64 + return xchg(sptep, new_spte); +#else + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { pfn_t pfn; + u64 old_spte; - if (!is_rmap_spte(*sptep)) { - __set_spte(sptep, new_spte); + old_spte = __xchg_spte(sptep, new_spte); + if (!is_rmap_spte(old_spte)) return; - } - pfn = spte_to_pfn(*sptep); - if (*sptep & shadow_accessed_mask) + pfn = spte_to_pfn(old_spte); + if (old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*sptep)) + if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); - __set_spte(sptep, new_spte); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -- cgit v1.2.3 From b79b93f92cb3b66b89d75525fdfd2454b1e1f446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 15:46:44 +0300 Subject: KVM: MMU: Don't drop accessed bit while updating an spte __set_spte() will happily replace an spte with the accessed bit set with one that has the accessed bit clear. Add a helper update_spte() which checks for this condition and updates the page flag if needed. 
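The design point is to pay for the atomic exchange only when an accessed bit could actually be lost; roughly (a standalone sketch, not the kernel function):

    #include <stdint.h>

    static const uint64_t shadow_accessed_mask = 1ull << 5; /* illustrative */
    static int page_accessed;          /* stands in for struct page state */

    static void __set_spte(uint64_t *p, uint64_t v) { *p = v; }

    static void update_spte(uint64_t *sptep, uint64_t new_spte)
    {
            /* Fast path: nothing can be lost if the new spte keeps the
             * bit, or the hardware does not track one at all. */
            if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
                    __set_spte(sptep, new_spte);
                    return;
            }
            /* Slow path: clear atomically and propagate the old bit. */
            uint64_t old = __atomic_exchange_n(sptep, new_spte,
                                               __ATOMIC_SEQ_CST);
            if (old & shadow_accessed_mask)
                    page_accessed = 1;
    }

    int main(void)
    {
            uint64_t spte = shadow_accessed_mask;
            update_spte(&spte, 0);
            return !page_accessed;
    }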
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba2efcf2b86..d8d48329cb8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 old_spte; + + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + __set_spte(sptep, new_spte); + } else { + old_spte = __xchg_spte(sptep, new_spte); + if (old_spte & shadow_accessed_mask) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + } +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); + update_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte; + u64 *spte, new_spte, old_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_accessed_mask; if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); - __set_spte(spte, new_spte); + old_spte = __xchg_spte(spte, new_spte); + if (is_shadow_present_pte(old_spte) + && (old_spte & shadow_accessed_mask)) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } } @@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - __set_spte(sptep, spte); + update_spte(sptep, spte); done: return ret; } -- cgit v1.2.3 From a5046e6c7d97d6574ffe6367311ea0b0de56aa58 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:49:05 +0800 Subject: KVM: x86 emulator: fix 'mov sreg,rm16' instruction decoding Memory reads for 'mov sreg,rm16' should be 16 bits only. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8bdddc4509..d842a7d2bc6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -170,7 +170,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem | ModRM, Group | Group1A, + ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit v1.2.3 From ce7a0ad3bdcd86e6cf907eb5992fecb1503daa26 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:50:21 +0800 Subject: KVM: x86 emulator: fix the comment of out instruction Fix the comment of out instruction, using the same style as the other instructions. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d842a7d2bc6..ad8d7cdd1eb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2949,8 +2949,8 @@ special_insn: &c->dst.val)) goto done; /* IO is needed */ break; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ + case 0xee: /* out dx,al */ + case 0xef: /* out dx,(e/r)ax */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); -- cgit v1.2.3 From e97e883f8bfbe02cfc2bfff45e68921dfe590c7e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:51:09 +0800 Subject: KVM: x86 emulator: fix 'and AL,imm8' instruction decoding 'and AL,imm8' should be masked as ByteOp; otherwise the destination operand length will not be correct and we may fill the full EAX on writeback. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ad8d7cdd1eb..59568ad21ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -123,7 +123,7 @@ static u32 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit v1.2.3 From b16b2b7bb5a78afceb7fe22f2a04476cd70182b7 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:52:53 +0800 Subject: KVM: x86 emulator: fix 'mov rm,sreg' instruction decoding The source operand of 'mov rm,sreg' is a segment register, not a general-purpose register, so remove SrcReg from the decoding. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 59568ad21ab..8337567a0f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -169,7 +169,7 @@ static u32 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, -- cgit v1.2.3 From 07cbc6c185aee2c0479776845988242a040c7c93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:54:19 +0800 Subject: KVM: x86 emulator: fix cli/sti instruction emulation If the IOPL check fails, cli/sti emulate a GP, and then we should skip the writeback since the default write OP is OP_REG. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8337567a0f4..286572a5675 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2979,17 +2979,19 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback.
*/ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ -- cgit v1.2.3 From 5d55f299f97769130c6cc67896414c988db309ab Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 7 Jul 2010 17:43:35 +0800 Subject: KVM: x86 emulator: re-implementing 'mov AL,moffs' instruction decoding This patch change to use DstAcc for decoding 'mov AL, moffs' and introduced SrcAcc for decoding 'mov moffs, AL'. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 286572a5675..255473f974a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -70,6 +70,7 @@ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ +#define SrcAcc (0xd<<4) /* Source Accumulator */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -177,8 +178,8 @@ static u32 opcode_table[256] = { 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ @@ -1186,6 +1187,25 @@ done_prefixes: else c->src.val = insn_fetch(u8, 1, c->eip); break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; case SrcOne: c->src.bytes = 1; c->src.val = 1; @@ -2854,13 +2874,7 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; + case 0xa0 ... 0xa3: /* mov */ case 0xa4 ... 0xa5: /* movs */ goto mov; case 0xa6 ... 0xa7: /* cmps */ -- cgit v1.2.3 From b0eeec29fe7a5b114000f769bd68ffa02652bfb7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 15:40:18 +0300 Subject: KVM: MMU: Only indicate a fetch fault in page fault error code if nx is enabled Bit 4 of the page fault error code is set only if EFER.NX is set. 
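The error-code layout makes the rule easy to see; a small standalone demo (mask names are stand-ins for the kernel's PFERR_*_MASK constants, values follow the architectural bit positions):

    #include <stdio.h>

    #define PFERR_PRESENT (1u << 0)
    #define PFERR_WRITE   (1u << 1)
    #define PFERR_USER    (1u << 2)
    #define PFERR_RSVD    (1u << 3)
    #define PFERR_FETCH   (1u << 4)  /* only meaningful when EFER.NX=1 */

    static unsigned fault_error_code(int write, int user, int fetch, int nx)
    {
            unsigned ec = 0;
            if (write)      ec |= PFERR_WRITE;
            if (user)       ec |= PFERR_USER;
            if (fetch && nx)          /* the fix: gate bit 4 on EFER.NX */
                    ec |= PFERR_FETCH;
            return ec;
    }

    int main(void)
    {
            printf("nx=0: %#x  nx=1: %#x\n",
                   fault_error_code(0, 1, 1, 0), fault_error_code(0, 1, 1, 1));
            return 0;
    }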
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 796a325c7e5..3a3f6d784d7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -245,7 +245,7 @@ err: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) + if (fetch_fault && is_nx(vcpu)) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; -- cgit v1.2.3 From f59c1d2ded54e4bd7a9126f4a32c9eca8b336457 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 16:20:43 +0300 Subject: KVM: MMU: Keep going on permission error Real hardware disregards permission errors when computing page fault error code bit 0 (page present). Do the same. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 52 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3a3f6d784d7..1cea41cad06 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -119,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker, { pt_element_t pte; gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - int rsvd_fault = 0; + bool eperm, present, rsvd_fault; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: + present = true; + eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + goto error; + } --walker->level; } #endif @@ -151,31 +155,36 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) - goto not_present; + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + present = false; + break; + } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + break; + } - rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); - if (rsvd_fault) - goto access_error; + if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + rsvd_fault = true; + break; + } if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) - goto access_error; + eperm = true; if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; + eperm = true; #if PTTYPE == 64 if (fetch_fault && (pte & PT64_NX_MASK)) - goto access_error; + eperm = true; #endif - if (!(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, @@ -214,6 +223,9 @@ walk: --walker->level; } + if (!present || eperm || rsvd_fault) + goto error; + if (write_fault && !is_dirty_gpte(pte)) { bool ret; @@ -233,14 +245,10 @@ walk: __func__, (u64)pte, pte_access, pt_access); return 1; -not_present: +error: walker->error_code = 0; - goto err; - -access_error: - walker->error_code = 
PFERR_PRESENT_MASK; - -err: + if (present) walker->error_code |= PFERR_PRESENT_MASK; if (write_fault) walker->error_code |= PFERR_WRITE_MASK; if (user_fault) -- cgit v1.2.3 From 673813e81d8468e80b6dd0fa839923eb9748dc49 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Jul 2010 15:02:25 +0200 Subject: KVM: fix lock imbalance in kvm_create_pit() Stanse found that there is an omitted unlock in kvm_create_pit in one fail path. Add the proper unlock there. Signed-off-by: Jiri Slaby Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Gleb Natapov Cc: "Michael S. Tsirkin" Cc: Gregory Haskins Cc: kvm@vger.kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 467cc47fb73..70db4d43539 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -696,6 +696,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->wq = create_singlethread_workqueue("kvm-pit-wq"); if (!pit->wq) { + mutex_unlock(&pit->pit_state.lock); kfree(pit); return NULL; } -- cgit v1.2.3 From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 7 Jul 2010 20:16:45 +0300 Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area Currently, if the guest accesses an address that belongs to a memory slot but is not backed by a page, or the page is read-only, KVM treats it like an MMIO access. Remove that capability. It was never part of the interface and should not be relied upon. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d8d48329cb8..89d7a2cae53 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) if (is_hwpoison_pfn(pfn)) { kvm_send_hwpoison_signal(kvm, gfn); return 0; - } + } else if (is_fault_pfn(pfn)) + return -EFAULT; + return 1; } -- cgit v1.2.3 From a6f177efaa5856e22ed0d3c1e81e65b41654d083 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 8 Jul 2010 12:41:12 +0300 Subject: KVM: Reenter guest after emulation failure if due to access to non-mmio address When shadow pages are in use, KVM sometimes tries to emulate an instruction when it accesses a shadowed page. If emulation fails, KVM un-shadows the page and re-enters the guest to allow the vcpu to execute the instruction. If the page is not in the shadow page hash, KVM assumes this was an attempt to do MMIO and reports an emulation failure to userspace, since there is no way to fix the situation. This logic has a race, though. If two vcpus try to write to the same shadowed page simultaneously, both will enter the emulator, but only one of them will find the page in the shadow page hash, since the one that finds it also removes it from there; so the other cpu will report failure to userspace and abort the guest. Fix this by checking (in addition to the shadow page hash) that the page that caused the emulation belongs to a valid memory slot. If it does, re-enter the guest to allow the vcpu to re-execute the instruction.
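The decision tree can be sketched as follows (a standalone model; the stub predicates stand in for kvm_mmu_unprotect_page_virt(), the gva-to-gpa translation, and the memslot lookup named in the patch):

    #include <stdbool.h>

    static bool unprotected_shadow_page(void) { return false; } /* lost the race */
    static bool gva_translates(void)          { return true;  }
    static bool gpa_backed_by_memslot(void)   { return true;  }

    static bool should_reexecute(void)
    {
            if (unprotected_shadow_page())
                    return true;          /* classic un-shadow path */
            if (!gva_translates())
                    return true;          /* let the cpu raise the fault */
            /* The fix: a RAM-backed gpa means this was not MMIO, so
             * retry instead of reporting emulation failure. */
            return gpa_backed_by_memslot();
    }

    int main(void) { return should_reexecute() ? 0 : 1; }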
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d72fc06705..d51eed239b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3930,6 +3930,29 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return EMULATE_FAIL; } +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3998,7 +4021,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4019,12 +4042,7 @@ restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); if (r) { /* emulation failed */ - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); -- cgit v1.2.3 From aea924f606c309feead37ab5c43f410a08ff3826 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 10 Jul 2010 17:37:56 +0800 Subject: KVM: PIT: stop vpit before freeing irq_routing Fix: general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ...... Call Trace: [] ? kvm_set_irq+0xdd/0x24b [kvm] [] ? trace_hardirqs_off_caller+0x1f/0x10e [] ? sub_preempt_count+0xe/0xb6 [] ? put_lock_stats+0xe/0x27 ... 
RIP [] kvm_set_irq+0x17e/0x24b [kvm] This bug is triggered when the guest is shut down, because we freed irq_routing before the PIT thread stopped. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 3 +++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 70db4d43539..0fd6378981f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -752,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51eed239b4..d721e2d81a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5523,12 +5523,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); -- cgit v1.2.3 From 908e75f3e70ca580cc20442cf6780dcc2d0557b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Jul 2010 14:09:38 +0300 Subject: KVM: Expose MCE control MSRs to userspace Userspace needs to reset and save/restore these MSRs. The MCE banks are not exposed since their number varies from vcpu to vcpu. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d721e2d81a5..eb55ec55125 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -744,6 +744,8 @@ static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; static int set_efer(struct kvm_vcpu *vcpu, u64 efer) -- cgit v1.2.3 From 32ef26a3598636be520abed90ed0c2f439d36bbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:04 +0300 Subject: KVM: MMU: Add link_shadow_page() helper To simplify the process of fetching an spte, add a helper that links a shadow page to an spte.
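As an illustration, composing such an spte amounts to OR-ing the child table's physical address with the standard permission bits (standalone sketch; the PT_* values are stand-ins for the kernel's masks):

    #include <stdio.h>
    #include <stdint.h>

    #define PT_PRESENT  (1ull << 0)
    #define PT_WRITABLE (1ull << 1)
    #define PT_USER     (1ull << 2)
    #define PT_ACCESSED (1ull << 5)

    static void link_shadow_page(uint64_t *sptep, uint64_t child_table_pa)
    {
            /* Point the parent entry at the child shadow table with the
             * permissive bits set; real permissions live in the leaves. */
            *sptep = child_table_pa | PT_PRESENT | PT_ACCESSED
                                    | PT_WRITABLE | PT_USER;
    }

    int main(void)
    {
            uint64_t spte = 0;
            link_shadow_page(&spte, 0x1000);
            printf("spte = %#llx\n", (unsigned long long)spte);
            return 0;
    }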
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89d7a2cae53..df3a7a79cce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1482,6 +1482,16 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cea41cad06..36dc0749c87 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -309,7 +309,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp; - u64 spte, *sptep = NULL; + u64 *sptep = NULL; int direct; gfn_t table_gfn; int r; @@ -395,10 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } } - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + link_shadow_page(sptep, sp); } return sptep; -- cgit v1.2.3 From 121eee97a7802acda8b78436cc53196e9885549f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:05 +0300 Subject: KVM: MMU: Use __set_spte to link shadow pages To avoid split accesses to 64 bit sptes on i386, use __set_spte() to link shadow pages together. (not technically required since shadow pages are __GFP_KERNEL, so upper 32 bits are always clear) Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df3a7a79cce..5a6019a534a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1489,7 +1489,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + __set_spte(sptep, spte); } static void kvm_mmu_page_unlink_children(struct kvm *kvm, -- cgit v1.2.3 From a3aa51cfaafe9179add88db20506ccb07e030b47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:06 +0300 Subject: KVM: MMU: Add drop_large_spte() helper To clarify spte fetching code, move large spte handling into a helper. 
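A corresponding sketch of the new helper's contract (standalone model; the PS bit position and the stubs are illustrative):

    #include <stdint.h>

    typedef uint64_t u64;

    static int  flushed;
    static int  is_large_pte(u64 s)          { return s & (1ull << 7); }
    static void drop_spte(u64 *sptep, u64 v) { *sptep = v; }
    static void kvm_flush_remote_tlbs(void)  { flushed = 1; }

    /* Tear down a huge-page mapping so the fetch path can install a
     * next-level page table in its place. */
    static void drop_large_spte(u64 *sptep)
    {
            if (is_large_pte(*sptep)) {
                    drop_spte(sptep, 0);
                    kvm_flush_remote_tlbs();
            }
    }

    int main(void)
    {
            u64 spte = (1ull << 7) | 0x1000;
            drop_large_spte(&spte);
            return !(spte == 0 && flushed);
    }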
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ arch/x86/kvm/paging_tmpl.h | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a6019a534a..b75d6cb44ab 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1492,6 +1492,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) __set_spte(sptep, spte); } +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36dc0749c87..0fb7068d64c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -360,10 +360,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, kvm_flush_remote_tlbs(vcpu->kvm); } - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + drop_large_spte(vcpu, sptep); if (level <= gw->level) { direct = 1; -- cgit v1.2.3 From a357bd229cdaf37a41798d238ab50b34c71dd0d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:07 +0300 Subject: KVM: MMU: Add validate_direct_spte() helper Add a helper to verify that a direct shadow page is valid wrt the required access permissions; drop the page if it is not valid. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 +++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 27 ++++++--------------------- 2 files changed, 29 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b75d6cb44ab..36c62f33513 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1500,6 +1500,29 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0fb7068d64c..0c7461d3a5b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -338,30 +338,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - - if (level != gw->level) - continue; - - /* - * For the direct sp, if the guest pte's dirty bit - * changed form clean to dirty, it will corrupt the - * sp's access: allow writable in the read-only sp, - * so we should update the spte at this point to get - * a new sp with the correct access. 
- */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); - if (child->role.access == direct_access) - continue; - - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) + && level == gw->level) + validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); + if (is_shadow_present_pte(*sptep)) + continue; + if (level <= gw->level) { direct = 1; access = direct_access; -- cgit v1.2.3 From 39c8c672a18c52048343d7531dfb2dcf3431ee74 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:08 +0300 Subject: KVM: MMU: Add gpte_valid() helper Move the code to check whether a gpte has changed since we fetched it into a helper. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0c7461d3a5b..e1c1f9eb1cc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -299,6 +299,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gpte_to_gfn(gpte), pfn, true, true); } +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, + struct guest_walker *gw, int level) +{ + int r; + pt_element_t curr_pte; + + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -312,11 +323,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, u64 *sptep = NULL; int direct; gfn_t table_gfn; - int r; int level; bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; - pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) @@ -365,17 +374,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); - if (!direct) { - r = kvm_read_guest_atomic(vcpu->kvm, - gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { + if (!direct) + /* + * Verify that the gpte in the page we've just write + * protected is still there. + */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } - } link_shadow_page(sptep, sp); } -- cgit v1.2.3 From 0b3c933302262d83018dd5f69656bca9f28a0cd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:09 +0300 Subject: KVM: MMU: Simplify spte fetch() function Partition the function into three sections: - fetching indirect shadow pages (host_level > guest_level) - fetching direct shadow pages (page_level < host_level <= guest_level) - the final spte (page_level == host_level) Instead of the current spaghetti. A slight change from the original code is that we call validate_direct_spte() more often: previously we called it only for gw->level, now we also call it for lower levels. The change should have no effect. 
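(A rough skeleton of the partitioned fetch(), condensed from the diff below; comments stand in for the loop bodies and the argument lists are abbreviated.)

	/* 1: indirect levels, walking down to gw->level */
	for (shadow_walk_init(&iterator, vcpu, addr);
	     shadow_walk_okay(&iterator) && iterator.level > gw->level;
	     shadow_walk_next(&iterator))
		/* get indirect sp, re-check the gpte, link_shadow_page() */;

	/* 2: direct levels, walking down to hlevel */
	for (; shadow_walk_okay(&iterator) && iterator.level > hlevel;
	     shadow_walk_next(&iterator))
		/* validate_direct_spte(), get direct sp, link_shadow_page() */;

	/* 3: the final spte at hlevel */
	mmu_set_spte(vcpu, iterator.sptep, /* ... */);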
[xiao: fix regression caused by validate_direct_spte() called too late] Signed-off-by: Avi Kivity Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 93 ++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e1c1f9eb1cc..368e4cb6233 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -321,9 +321,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned access = gw->pt_access; struct kvm_mmu_page *sp; u64 *sptep = NULL; - int direct; - gfn_t table_gfn; - int level; + int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -335,61 +333,68 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; - for_each_shadow_entry(vcpu, addr, iterator) { + for (shadow_walk_init(&iterator, vcpu, addr); + shadow_walk_okay(&iterator) && iterator.level > gw->level; + shadow_walk_next(&iterator)) { + gfn_t table_gfn; + level = iterator.level; sptep = iterator.sptep; - if (iterator.level == hlevel) { - mmu_set_spte(vcpu, sptep, access, - gw->pte_access & access, - user_fault, write_fault, - dirty, ptwrite, level, - gw->gfn, pfn, false, true); - break; - } - - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) - && level == gw->level) - validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); if (is_shadow_present_pte(*sptep)) continue; - if (level <= gw->level) { - direct = 1; - access = direct_access; - - /* - * It is a large guest pages backed by small host pages, - * So we set @direct(@sp->role.direct)=1, and set - * @table_gfn(@sp->gfn)=the base page frame for linear - * translations. - */ - table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - access &= gw->pte_access; - } else { - direct = 0; - table_gfn = gw->table_gfn[level - 2]; - } + table_gfn = gw->table_gfn[level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - direct, access, sptep); - if (!direct) - /* - * Verify that the gpte in the page we've just write - * protected is still there. - */ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { - kvm_mmu_put_page(sp, sptep); - kvm_release_pfn_clean(pfn); - sptep = NULL; - break; - } + false, access, sptep); + + /* + * Verify that the gpte in the page we've just write + * protected is still there. 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + goto out_gpte_changed; + + link_shadow_page(sptep, sp); + } + + for (; + shadow_walk_okay(&iterator) && iterator.level > hlevel; + shadow_walk_next(&iterator)) { + gfn_t direct_gfn; + level = iterator.level; + sptep = iterator.sptep; + + validate_direct_spte(vcpu, sptep, direct_access); + + drop_large_spte(vcpu, sptep); + + if (is_shadow_present_pte(*sptep)) + continue; + + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, + true, direct_access, sptep); link_shadow_page(sptep, sp); } + sptep = iterator.sptep; + level = iterator.level; + + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, level, + gw->gfn, pfn, false, true); + return sptep; + +out_gpte_changed: + kvm_mmu_put_page(sp, sptep); + kvm_release_pfn_clean(pfn); + return NULL; } /* -- cgit v1.2.3 From 5991b33237b7fc7dd9f62ae04998c42217d444a7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:10 +0300 Subject: KVM: MMU: Validate all gptes during fetch, not just those used for new pages Currently, when we fetch an spte, we only verify that gptes match those that the walker saw if we build new shadow pages for them. However, this misses the following race: vcpu1 vcpu2 walk change gpte walk instantiate sp fetch existing sp Fix by validating every gpte, regardless of whether it is used for building a new sp or not. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 368e4cb6233..8cb85f9c8ad 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -319,10 +319,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp = NULL; u64 *sptep = NULL; int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + int top_level; unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -333,6 +334,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; + top_level = vcpu->arch.mmu.root_level; + if (top_level == PT32E_ROOT_LEVEL) + top_level = PT32_ROOT_LEVEL; + /* + * Verify that the top-level gpte is still there. Since the page + * is a root page, it is either write protected (and cannot be + * changed from now on) or it is invalid (in which case, we don't + * really care if it changes underneath us after this point). 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, top_level)) + goto out_gpte_changed; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator) && iterator.level > gw->level; shadow_walk_next(&iterator)) { @@ -343,12 +356,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, drop_large_spte(vcpu, sptep); - if (is_shadow_present_pte(*sptep)) - continue; - - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + sp = NULL; + if (!is_shadow_present_pte(*sptep)) { + table_gfn = gw->table_gfn[level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + false, access, sptep); + } /* * Verify that the gpte in the page we've just write @@ -357,7 +370,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, level - 1)) goto out_gpte_changed; - link_shadow_page(sptep, sp); + if (sp) + link_shadow_page(sptep, sp); } for (; @@ -392,7 +406,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return sptep; out_gpte_changed: - kvm_mmu_put_page(sp, sptep); + if (sp) + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit v1.2.3 From 24157aaf833261e68e5a398fa54bd15e4fa1d0b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:11 +0300 Subject: KVM: MMU: Eliminate redundant temporaries in FNAME(fetch) 'level' and 'sptep' are aliases for 'interator.level' and 'iterator.sptep', no need for them. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 59 +++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8cb85f9c8ad..d9a2742014e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -320,12 +320,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - u64 *sptep = NULL; - int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); int top_level; unsigned direct_access; - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; @@ -346,68 +344,59 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - for (shadow_walk_init(&iterator, vcpu, addr); - shadow_walk_okay(&iterator) && iterator.level > gw->level; - shadow_walk_next(&iterator)) { + for (shadow_walk_init(&it, vcpu, addr); + shadow_walk_okay(&it) && it.level > gw->level; + shadow_walk_next(&it)) { gfn_t table_gfn; - level = iterator.level; - sptep = iterator.sptep; - - drop_large_spte(vcpu, sptep); + drop_large_spte(vcpu, it.sptep); sp = NULL; - if (!is_shadow_present_pte(*sptep)) { - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access, it.sptep); } /* * Verify that the gpte in the page we've just write * protected is still there. 
*/ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; if (sp) - link_shadow_page(sptep, sp); + link_shadow_page(it.sptep, sp); } for (; - shadow_walk_okay(&iterator) && iterator.level > hlevel; - shadow_walk_next(&iterator)) { + shadow_walk_okay(&it) && it.level > hlevel; + shadow_walk_next(&it)) { gfn_t direct_gfn; - level = iterator.level; - sptep = iterator.sptep; + validate_direct_spte(vcpu, it.sptep, direct_access); - validate_direct_spte(vcpu, sptep, direct_access); + drop_large_spte(vcpu, it.sptep); - drop_large_spte(vcpu, sptep); - - if (is_shadow_present_pte(*sptep)) + if (is_shadow_present_pte(*it.sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, - true, direct_access, sptep); - link_shadow_page(sptep, sp); + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, + true, direct_access, it.sptep); + link_shadow_page(it.sptep, sp); } - sptep = iterator.sptep; - level = iterator.level; - - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - user_fault, write_fault, dirty, ptwrite, level, + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); - return sptep; + return it.sptep; out_gpte_changed: if (sp) - kvm_mmu_put_page(sp, sptep); + kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit v1.2.3 From c0e0608cb902af1a1fd8d413ec0a07ee1e62c652 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 13 Jul 2010 16:40:23 +0300 Subject: KVM: x86: emulator: inc/dec can have lock prefix Mark inc (0xfe/0 0xff/0) and dec (0xfe/1 0xff/1) as lock prefix capable. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 255473f974a..b38bd8b92aa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -345,10 +345,10 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, -- cgit v1.2.3 From 68be0803456b3eed33038be5566710ad7648c854 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 14 Jul 2010 19:05:45 +0300 Subject: KVM: x86: never re-execute instruction with enabled tdp With tdp enabled we should get into emulator only when emulating io, so reexecution will always bring us back into emulator. 
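(A sketch of the resulting guard with the reasoning spelled out in a comment; the actual one-hunk change follows, and the rest of the function body is elided here.)

	static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
	{
		/*
		 * With two-dimensional paging there are no shadowed guest
		 * page tables to unshadow, so re-entering the guest cannot
		 * cure a failed emulation; it would only fault straight
		 * back into the emulator.
		 */
		if (tdp_enabled)
			return false;
		/* ... shadow-paging unshadow path ... */
	}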
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb55ec55125..689c2c3182a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3936,6 +3936,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; + if (tdp_enabled) + return false; + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-entetr the -- cgit v1.2.3 From 9195c4da26bbf8860e2e7b648dbf4ab465c7933a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 15 Jul 2010 12:24:37 +0300 Subject: KVM: x86: Call mask notifiers from pic If the pit delivers an interrupt while the pic is masking it, the OS will never do EOI and the ack notifier will not be called, so once the pit is unmasked no pit interrupts will be delivered any more. Calling mask notifiers solves this issue. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 819b748a33f..8d10c063d7f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -363,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } } else switch (s->init_state) { - case 0: /* normal mode */ + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; + } case 1: s->irq_base = val & 0xf8; s->init_state = 2; -- cgit v1.2.3 From c19b8bd60e19308d5583ef200ddcc782d85d9543 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 15 Jul 2010 08:51:58 +0800 Subject: KVM: x86 emulator: fix xchg instruction emulation If the destination is a memory operand and the memory cannot map to a valid page, the xchg instruction emulation and locked instructions will not work on io regions and get stuck in an endless loop. We should emulate exchange as a write to fix it. Signed-off-by: Wei Yongjun Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 689c2c3182a..97aab036dab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } kaddr = kmap_atomic(page, KM_USER0); kaddr += offset_in_page(gpa); -- cgit v1.2.3 From 6e3e243c3b6e0bbd18c6ce0fbc12bc3fe2d77b34 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 16 Jul 2010 11:52:55 +0200 Subject: KVM: MMU: fix mmu notifier invalidate handler for huge spte The index wasn't calculated correctly (off by one) for huge sptes, so KVM guests were unstable with transparent hugepages.
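(A worked example with made-up numbers may help; assume 512 small pages per huge page, i.e. a gfn shift of 9.)

	/*
	 * memslot->base_gfn = 384 (not huge-page aligned), gfn_offset = 200:
	 *
	 *   old: idx = 200 / 512                        = 0
	 *   new: idx = ((384 + 200) >> 9) - (384 >> 9)  = 1 - 0 = 1
	 *
	 * The old formula ignored the slot's unaligned base_gfn, so the
	 * handler ran on the rmap of the wrong (neighbouring) huge page.
	 */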
Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36c62f33513..812770cddc8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -850,8 +850,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int sh; + + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); + idx = ((memslot->base_gfn+gfn_offset) >> sh) - + (memslot->base_gfn >> sh); ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); -- cgit v1.2.3 From fa1de2bfc0feb7245328ad25fb3e6d5cd2c903b4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:19:51 +0800 Subject: KVM: MMU: add missing reserved bits check in speculative path In the speculative path, we should check the guest pte's reserved bits just as the real processor does. Reported-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++++++- arch/x86/kvm/paging_tmpl.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 812770cddc8..d2ea9cabc06 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2697,6 +2697,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + return; + ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -2775,6 +2778,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); @@ -2849,6 +2853,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ?
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -2896,7 +2901,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry) + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d9a2742014e..51ef9097960 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -638,8 +638,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (gfn != sp->gfns[i] || - !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + || gfn != sp->gfns[i] || !is_present_gpte(gpte) + || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; if (is_present_gpte(gpte) || !clear_unsync) -- cgit v1.2.3 From daa3db693ce925a14b7e17ab6f306dc0e6a5342c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:23:04 +0800 Subject: KVM: MMU: fix broken page accessed tracking with ept enabled In the current code, if ept is enabled (shadow_accessed_mask = 0), page accessed tracking is lost. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2ea9cabc06..9b3b916ebea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -687,7 +687,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); - if (old_spte & shadow_accessed_mask) + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); @@ -815,7 +815,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, kvm_set_pfn_dirty(spte_to_pfn(*spte)); old_spte = __xchg_spte(spte, new_spte); if (is_shadow_present_pte(old_spte) - && (old_spte & shadow_accessed_mask)) + && (!shadow_accessed_mask || + old_spte & shadow_accessed_mask)) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } -- cgit v1.2.3 From 9ed5520dd3c9cb79c25f95fce9c57b87637d0fb7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:25:17 +0800 Subject: KVM: MMU: fix page dirty tracking lost while sync page In the sync-page path, if spte.writable changes, page dirty tracking can be lost. For example: assume spte.writable = 0 in an unsync page; when it is synced, the spte is mapped writable (spte.writable = 1); later the guest writes spte.gfn, so spte.gfn is dirty; the guest then changes the mapping to read-only, and after the next sync spte.writable = 0. So when the host releases the spte, it sees spte.writable = 0 and does not mark the page dirty. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b3b916ebea..a04756a26fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1985,6 +1985,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) + kvm_set_pfn_dirty(pfn);
update_spte(sptep, spte); done: return ret; @@ -1998,7 +2000,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -2048,15 +2049,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); - } else { - if (was_writable) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); } + kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; -- cgit v1.2.3 From be233d49ea8c1fde9f4afec378dc2c2f16ab0263 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:27:10 +0800 Subject: KVM: MMU: don't atomicly set spte if it's not present If the old mapping is not present, the spte.a is not lost, so no need atomic operation to set it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a04756a26fe..9c7fae08291 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,9 +307,10 @@ static void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || + !is_rmap_spte(*sptep)) __set_spte(sptep, new_spte); - } else { + else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); -- cgit v1.2.3 From e4b502ead259fcf70839414abb7c8cdc3b523f01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:28:09 +0800 Subject: KVM: MMU: cleanup spte set and accssed/dirty tracking Introduce set_spte_track_bits() to cleanup current code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c7fae08291..e4b862eb888 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -679,7 +679,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte; @@ -692,6 +692,11 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + set_spte_track_bits(sptep, new_spte); rmap_remove(kvm, sptep); } @@ -791,7 +796,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte, old_spte; + u64 *spte, new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -812,13 +817,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(spte_to_pfn(*spte)); - old_spte = __xchg_spte(spte, new_spte); - if (is_shadow_present_pte(old_spte) - && (!shadow_accessed_mask 
|| - old_spte & shadow_accessed_mask)) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + set_spte_track_bits(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } -- cgit v1.2.3 From 9a3aad70572c3f4d55e7f09ac4eb313d41d0a484 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:30:18 +0800 Subject: KVM: MMU: using __xchg_spte more smarter Sometimes, atomically set spte is not needed, this patch call __xchg_spte() more smartly Note: if the old mapping's access bit is already set, we no need atomic operation since the access bit is not lost Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4b862eb888..0dcc95e0987 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -682,9 +682,14 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; - u64 old_spte; + u64 old_spte = *sptep; + + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || + old_spte & shadow_accessed_mask) { + __set_spte(sptep, new_spte); + } else + old_spte = __xchg_spte(sptep, new_spte); - old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); -- cgit v1.2.3 From 3444d7da1839b851eefedd372978d8a982316c36 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 18:32:38 +0300 Subject: KVM: VMX: Fix host GDT.LIMIT corruption vmx does not restore GDT.LIMIT to the host value, instead it sets it to 64KB. This means host userspace can learn a few bits of host memory. Fix by reloading GDTR when we load other host state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2fdcc9819f3..27a0222c294 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -185,6 +185,7 @@ static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -871,6 +872,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (current_thread_info()->status & TS_USEDFPU) clts(); + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1379,6 +1381,8 @@ static int hardware_enable(void *garbage) ept_sync_global(); } + store_gdt(&__get_cpu_var(host_gdt)); + return 0; } -- cgit v1.2.3 From e8c534ec068af1a0845aceda373a9bfd2de62030 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Tue, 27 Jul 2010 18:53:35 +0200 Subject: x86: Fix keeping track of AMD C1E Accomodate the original C1E-aware idle routine to the different times during boot when the BIOS enables C1E. While at it, remove the synthetic CPUID flag in favor of a single global setting which denotes C1E status on the system. [ hpa: changed c1e_enabled to be a bool; clarified cpu bit 3:21 comment ] Signed-off-by: Michal Schmidt LKML-Reference: <20100727165335.GA11630@aftab> Signed-off-by: Borislav Petkov Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/process.c | 8 +++++--- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aa2c39d968f..92091de1111 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + else if (c1e_detected) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 817aa316b18..0b205b8a430 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -89,7 +89,7 @@ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ + /* 21 available, was AMD_C1E */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d85637bb950..325b7bdbeba 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -762,6 +762,7 @@ extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; extern unsigned long idle_nomwait; +extern bool c1e_detected; /* * on systems with caches, caches must be flashed as the absolute diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 553b02f1309..b944f89c4e6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,8 +525,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); + static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -548,12 +550,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit v1.2.3 From fe96eb404e33b59bb39f7050205f7c56c1c7d686 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 18 Mar 2010 13:53:24 -0400 Subject: x86: Detect whether we should use Xen SWIOTLB. It is paramount that we call pci_xen_swiotlb_detect before pci_swiotlb_detect as both implementations use the 'swiotlb' and 'swiotlb_force' flags. The pci-xen_swiotlb_detect inhibits the swiotlb_force and swiotlb flag so that the native SWIOTLB implementation is not enabled when running under Xen. [since v1 changed two Cc's to Acked-by] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Acked-by: FUJITA Tomonori [http://lkml.org/lkml/2010/7/27/374] Cc: Albert Herranz Cc: Ian Campbell Cc: Thomas Gleixner Acked-by: "H. 
Peter Anvin" [conditional http://lkml.org/lkml/2010/8/2/324] Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01d..9f07cfcbd3a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -13,6 +13,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; @@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) goto out; gart_iommu_hole_init(); @@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); out: + pci_xen_swiotlb_init(); + pci_swiotlb_init(); } @@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb) { + if (swiotlb || xen_swiotlb) { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_print_info(); -- cgit v1.2.3 From be783a47214afc5a0aea9dafcbd9f1535ba05e94 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 2 Aug 2010 08:49:34 +0800 Subject: x86, vdso: Unmap vdso pages We mapped vdso pages but never unmapped them and the virtual address is lost after exiting from the function, so unmap vdso pages here. Signed-off-by: Shaohua Li LKML-Reference: <20100802004934.GA2505@sli10-desk.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b814..80f23ed483e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -67,6 +67,7 @@ static int __init init_vdso_vars(void) *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" #undef VEXTERN + vunmap(vbase); return 0; oom: -- cgit v1.2.3 From 22a57f5896df218356bae6203dfaf04bcfd6c88c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 15:34:44 -0700 Subject: x86, setup: Allow global variables and functions in the decompressor In order for global variables and functions to work in the decompressor, we need to fix up the GOT in assembly code. Signed-off-by: H. Peter Anvin LKML-Reference: <4C57382E.8050501@zytor.com> --- arch/x86/boot/compressed/head_32.S | 13 +++++++++++++ arch/x86/boot/compressed/head_64.S | 13 +++++++++++++ arch/x86/boot/compressed/vmlinux.lds.S | 6 ++++++ 3 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index f543b70ffae..67a655a39ce 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -123,6 +123,19 @@ relocated: shrl $2, %ecx rep stosl +/* + * Adjust our own GOT + */ + leal _got(%ebx), %edx + leal _egot(%ebx), %ecx +1: + cmpl %ecx, %edx + jae 2f + addl %ebx, (%edx) + addl $4, %edx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. 
*/ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index faff0dc9c06..52f85a196fa 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -279,6 +279,19 @@ relocated: shrq $3, %rcx rep stosq +/* + * Adjust our own GOT + */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + addq %rbx, (%rdx) + addq $8, %rdx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. */ diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 5ddabceee12..34d047c9828 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -41,6 +41,12 @@ SECTIONS *(.rodata.*) _erodata = . ; } + .got : { + _got = .; + KEEP(*(.got.plt)) + KEEP(*(.got)) + _egot = .; + } .data : { _data = . ; *(.data) -- cgit v1.2.3 From f4ed2877b16e8146427306aea8819adac5c88374 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 02:17:31 -0700 Subject: x86, setup: reorganize the early console setup Separate early_serial_console from tty.c This allows for reuse of early_serial_console.c/string.c/printf.c/cmdline.c in boot/compressed/. -v2: according to hpa, don't include string.c etc -v3: compressed/misc.c must have early_serial_base as static, so move it back to tty.c for setup code Signed-off-by: Yinghai Lu LKML-Reference: <4C568D2B.205@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 8 +- arch/x86/boot/boot.h | 35 +++++---- arch/x86/boot/cmdline.c | 6 +- arch/x86/boot/early_serial_console.c | 139 +++++++++++++++++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 ++++++ arch/x86/boot/printf.c | 4 +- arch/x86/boot/tty.c | 136 +--------------------------------- 7 files changed, 186 insertions(+), 163 deletions(-) create mode 100644 arch/x86/boot/early_serial_console.c create mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ec749c2bfdd..f7cb086b4ad 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o -setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o regs.o string.o tty.o video.o video-mode.o -setup-y += version.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o +setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o +setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o +setup-y += video-mode.o version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 46c4c5c71af..00cf51cfc2e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -200,21 +200,7 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} +#include "isdigit.h" /* Heap -- available for dynamic lists. 
*/ extern char _end[]; @@ -300,8 +286,18 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int cmdline_find_option(const char *option, char *buffer, int bufsize); -int cmdline_find_option_bool(const char *option); +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); +} + +static inline int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); +} + /* cpu.c, cpucheck.c */ struct cpu_features { @@ -313,6 +309,10 @@ extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + /* edd.c */ void query_edd(void); @@ -348,7 +348,6 @@ unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ -void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index a1d35634bce..6b3b6f708c0 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,9 +27,8 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int cmdline_find_option(const char *option, char *buffer, int bufsize) +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int len = -1; @@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int cmdline_find_option_bool(const char *option) +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int pos = 0, wstart = 0; diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c new file mode 100644 index 00000000000..030f4b93e25 --- /dev/null +++ b/arch/x86/boot/early_serial_console.c @@ -0,0 +1,139 @@ +#include "boot.h" + +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + +static void early_serial_init(int port, int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = port; +} 
+ +static void parse_earlyprintk(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + int port = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + port = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int idx = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + idx = 1; + + port = bases[idx]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (port) + early_serial_init(port, baud); +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static void parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + int port = 0; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + port = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + port = simple_strtoull(options + 8, &options, 0); + else + return; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(port); + + if (port) + early_serial_init(port, baud); +} + +void console_init(void) +{ + parse_earlyprintk(); + + if (!early_serial_base) + parse_console_uart8250(); +} diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h new file mode 100644 index 00000000000..25e13403193 --- /dev/null +++ b/arch/x86/boot/isdigit.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 50e47cdbddd..cdac91ca55d 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -34,7 +34,7 @@ static int skip_atoi(const char **s) #define SMALL 32 /* Must be 32 == 0x20 */ #define SPECIAL 64 /* 0x */ -#define do_div(n,base) ({ \ +#define __do_div(n, base) ({ \ int __res; \ __res = ((unsigned long) n) % (unsigned) base; \ n = ((unsigned long) n) / (unsigned) base; \ @@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = (digits[do_div(num, base)] | locase); + tmp[i++] = (digits[__do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index ff4b27a0fc5..def2451f46a 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -15,27 +15,12 @@ #include "boot.h" -#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ - -static int early_serial_base; +int early_serial_base; #define XMTRDY 0x20 -#define DLAB 0x80 - #define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 
/* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ #define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -#define DEFAULT_BAUD 9600 /* * These functions are in .inittext so they can be used to signal @@ -152,122 +137,3 @@ int getchar_timeout(void) return 0; /* Timeout! */ } -static void early_serial_init(int port, int baud) -{ - unsigned char c; - unsigned divisor; - - outb(0x3, port + LCR); /* 8n1 */ - outb(0, port + IER); /* no interrupt */ - outb(0, port + FCR); /* no fifo */ - outb(0x3, port + MCR); /* DTR + RTS */ - - divisor = 115200 / baud; - c = inb(port + LCR); - outb(c | DLAB, port + LCR); - outb(divisor & 0xff, port + DLL); - outb((divisor >> 8) & 0xff, port + DLH); - outb(c & ~DLAB, port + LCR); - - early_serial_base = port; - - printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); -} - -static void parse_earlyprintk(void) -{ - int baud = DEFAULT_BAUD; - char arg[32]; - int pos = 0; - int port = 0; - - if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { - char *e; - - if (!strncmp(arg, "serial", 6)) { - port = DEFAULT_SERIAL_PORT; - pos += 6; - } - - if (arg[pos] == ',') - pos++; - - if (!strncmp(arg, "ttyS", 4)) { - static const int bases[] = { 0x3f8, 0x2f8 }; - int idx = 0; - - if (!strncmp(arg + pos, "ttyS", 4)) - pos += 4; - - if (arg[pos++] == '1') - idx = 1; - - port = bases[idx]; - } - - if (arg[pos] == ',') - pos++; - - baud = simple_strtoull(arg + pos, &e, 0); - if (baud == 0 || arg + pos == e) - baud = DEFAULT_BAUD; - } - - if (port) - early_serial_init(port, baud); -} - -#define BASE_BAUD (1843200/16) -static unsigned int probe_baud(int port) -{ - unsigned char lcr, dll, dlh; - unsigned int quot; - - lcr = inb(port + LCR); - outb(lcr | DLAB, port + LCR); - dll = inb(port + DLL); - dlh = inb(port + DLH); - outb(lcr, port + LCR); - quot = (dlh << 8) | dll; - - return BASE_BAUD / quot; -} - -static void parse_console_uart8250(void) -{ - char optstr[64], *options; - int baud = DEFAULT_BAUD; - int port = 0; - - /* - * console=uart8250,io,0x3f8,115200n8 - * need to make sure it is last one console ! - */ - if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return; - - options = optstr; - - if (!strncmp(options, "uart8250,io,", 12)) - port = simple_strtoull(options + 12, &options, 0); - else if (!strncmp(options, "uart,io,", 8)) - port = simple_strtoull(options + 8, &options, 0); - else - return; - - if (options && (options[0] == ',')) - baud = simple_strtoull(options + 1, &options, 0); - else - baud = probe_baud(port); - - if (port) - early_serial_init(port, baud); -} - -void console_init(void) -{ - parse_earlyprintk(); - - if (!early_serial_base) - parse_console_uart8250(); -} -- cgit v1.2.3 From 9f242dc10e0c3c1eb32d8c83c18650a35fd7f80d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 2 Aug 2010 16:10:37 -0700 Subject: x86, vmware: Preset lpj values when on VMware. When running on VMware's platform, we have seen situations where the AP's try to calibrate the lpj values and fail to get good calibration runs becasue of timing issues. As a result delays don't work correctly on all cpus. The solutions is to set preset_lpj value based on the current tsc frequency value. This is similar to what KVM does as well. Signed-off-by: Alok N Kataria LKML-Reference: <1280790637.14933.29.camel@ank32.eng.vmware.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff58844..227b0448960 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -- cgit v1.2.3 From 8fee13a48e4879fba57725f6d9513df4bfa8e9f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 16:21:22 -0700 Subject: x86, setup: enable early console output from the decompressor This enables the decompressor output to be seen on the serial console. Most of the code is shared with the regular boot code. We could add printf to the decompressor if needed, but currently there is no sufficiently compelling user. -v2: define BOOT_BOOT_H to avoid include boot.h -v3: early_serial_base need to be static in misc.c ? -v4: create seperate string.c printf.c cmdline.c early_serial_console.c after hpa's patch that allow global variables in compressed/misc stage -v5: remove printf.c related Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 4 +- arch/x86/boot/compressed/cmdline.c | 21 ++++++++++ arch/x86/boot/compressed/early_serial_console.c | 5 +++ arch/x86/boot/compressed/misc.c | 56 +++++++++++++++---------- arch/x86/boot/compressed/misc.h | 38 +++++++++++++++++ arch/x86/boot/compressed/string.c | 4 ++ arch/x86/boot/main.c | 6 +-- 7 files changed, 105 insertions(+), 29 deletions(-) create mode 100644 arch/x86/boot/compressed/cmdline.c create mode 100644 arch/x86/boot/compressed/early_serial_console.c create mode 100644 arch/x86/boot/compressed/misc.h create mode 100644 arch/x86/boot/compressed/string.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fbb47daf245..0c229551eea 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c new file mode 100644 index 00000000000..cb62f786990 --- /dev/null +++ b/arch/x86/boot/compressed/cmdline.c @@ -0,0 +1,21 @@ +#include "misc.h" + +static unsigned long fs; 
+static inline void set_fs(unsigned long seg) +{ + fs = seg << 4; /* shift it back */ +} +typedef unsigned long addr_t; +static inline char rdfs8(addr_t addr) +{ + return *((char *)(fs + addr)); +} +#include "../cmdline.c" +int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); +} +int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); +} diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c new file mode 100644 index 00000000000..261e81fb958 --- /dev/null +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -0,0 +1,5 @@ +#include "misc.h" + +int early_serial_base; + +#include "../early_serial_console.c" diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 51e240779a4..8f7bef8e9ff 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -9,23 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -/* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening - */ -#undef CONFIG_PARAVIRT -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif - -#include -#include -#include -#include -#include -#include -#include +#include "misc.h" /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -123,15 +107,13 @@ static void error(char *m); /* * This is set up by the setup-routine at boot-time */ -static struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; +static int debug; void *memset(void *s, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); -static void __putstr(int, const char *); -#define putstr(__x) __putstr(0, __x) - #ifdef CONFIG_X86_64 #define memptr long #else @@ -170,7 +152,21 @@ static void scroll(void) vidmem[i] = ' '; } -static void __putstr(int error, const char *s) +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ +static void serial_putchar(int ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +void __putstr(int error, const char *s) { int x, y, pos; char c; @@ -179,6 +175,14 @@ static void __putstr(int error, const char *s) if (!error) return; #endif + if (early_serial_base) { + const char *str = s; + while (*str) { + if (*str == '\n') + serial_putchar('\r'); + serial_putchar(*str++); + } + } if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) @@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (real_mode->hdr.loadflags & QUIET_FLAG) + if (cmdline_find_option_bool("quiet")) quiet = 1; + if (cmdline_find_option_bool("debug")) + debug = 1; if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; @@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, lines = real_mode->screen_info.orig_video_lines; cols = real_mode->screen_info.orig_video_cols; + console_init(); + if (debug) + putstr("early console in decompress_kernel\n"); + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git 
a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h new file mode 100644 index 00000000000..a267849ac1c --- /dev/null +++ b/arch/x86/boot/compressed/misc.h @@ -0,0 +1,38 @@ +#ifndef BOOT_COMPRESSED_MISC_H +#define BOOT_COMPRESSED_MISC_H + +/* + * we have to be careful, because no indirections are allowed here, and + * paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening + */ +#undef CONFIG_PARAVIRT +#ifdef CONFIG_X86_32 +#define _ASM_X86_DESC_H 1 +#endif + +#include +#include +#include +#include +#include +#include +#include + +#define BOOT_BOOT_H + +/* misc.c */ +extern struct boot_params *real_mode; /* Pointer to real-mode data */ +void __putstr(int error, const char *s); +#define putstr(__x) __putstr(0, __x) +#define puts(__x) __putstr(0, __x) + +/* cmdline.c */ +int cmdline_find_option(const char *option, char *buffer, int bufsize); +int cmdline_find_option_bool(const char *option); + +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + +#endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c new file mode 100644 index 00000000000..7995c6a4950 --- /dev/null +++ b/arch/x86/boot/compressed/string.c @@ -0,0 +1,4 @@ +#include "misc.h" + +#include "../isdigit.h" +#include "../string.c" diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 4ef1a33e857..40358c8905b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -132,6 +132,8 @@ void main(void) /* Initialize the early-boot console */ console_init(); + if (cmdline_find_option_bool("debug")) + puts("early console in setup code\n"); /* End of heap check */ init_heap(); @@ -171,10 +173,6 @@ void main(void) /* Set the video mode */ set_video(); - /* Parse command line for 'quiet' and pass it to decompressor. */ - if (cmdline_find_option_bool("quiet")) - boot_params.hdr.loadflags |= QUIET_FLAG; - /* Do the last things and invoke protected mode */ go_to_protected_mode(); } -- cgit v1.2.3 From 6238b47b58480cd9c092600c05338dbe261b71ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 21:03:46 -0700 Subject: x86, setup: move isdigit.h to ctype.h, header files on top. It is a subset of functionality, so name it ctype.h. Also, reorganize header files so #include statements are clustered near the top as they should be. Signed-off-by: H. Peter Anvin LKML-Reference: <4C5752F2.8030206@kernel.org> --- arch/x86/boot/boot.h | 3 +-- arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/compressed/string.c | 2 -- arch/x86/boot/ctype.h | 21 +++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 --------------------- 5 files changed, 23 insertions(+), 25 deletions(-) create mode 100644 arch/x86/boot/ctype.h delete mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 00cf51cfc2e..c7093bd9f2d 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -28,6 +28,7 @@ #include "bitops.h" #include #include +#include "ctype.h" /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -200,8 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -#include "isdigit.h" - /* Heap -- available for dynamic lists. 
*/ extern char _end[]; extern char *HEAP; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a267849ac1c..3f19c81a620 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -20,6 +20,7 @@ #include #define BOOT_BOOT_H +#include "../ctype.h" /* misc.c */ extern struct boot_params *real_mode; /* Pointer to real-mode data */ diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 7995c6a4950..19b3e693cd7 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,4 +1,2 @@ #include "misc.h" - -#include "../isdigit.h" #include "../string.c" diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h new file mode 100644 index 00000000000..25e13403193 --- /dev/null +++ b/arch/x86/boot/ctype.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h deleted file mode 100644 index 25e13403193..00000000000 --- a/arch/x86/boot/isdigit.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef BOOT_ISDIGIT_H - -#define BOOT_ISDIGIT_H - -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} - -#endif -- cgit v1.2.3 From 35f2915c3bd0cd6950bdd9d461de565e8feae852 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 1 Jun 2010 13:07:34 +0100 Subject: intel_scu_ipc: add definitions for vRTC related command Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4470c9ad4a3..03200452069 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,6 +1,12 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#define IPCMSG_VRTC 0xFA /* Set vRTC device */ + +/* Command id associated with message IPCMSG_VRTC */ +#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ +#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ + /* Read single register */ int intel_scu_ipc_ioread8(u16 addr, u8 *data); -- cgit v1.2.3 From 804f8681a99da2aa49bd7f0dab3750848d1ab1bc Mon Sep 17 00:00:00 2001 From: Sreedhara DS Date: Mon, 26 Jul 2010 10:03:10 +0100 Subject: Remove indirect read write api support. The firmware of production devices does not support this interface so this is dead code. 
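For context, what survives this removal are the direct accessors declared earlier in intel_scu_ipc.h. A minimal sketch of a caller (the helper name and fallback logic are purely illustrative, not part of this patch):

	#include <asm/intel_scu_ipc.h>

	/* Read one register over SCU IPC; fall back to a default
	 * value if the IPC call fails. */
	static u8 example_scu_read(u16 addr, u8 def)
	{
		u8 val;

		if (intel_scu_ipc_ioread8(addr, &val))
			return def;
		return val;
	}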
Signed-off-by: Sreedhara DS Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 03200452069..29f66793cc5 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -34,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); /* Update single register based on the mask */ int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); -/* - * Indirect register read - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_read(u32 addr, u32 *data); - -/* - * Indirect register write - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_write(u32 addr, u32 data); - /* Issue commands to the SCU with or without data */ int intel_scu_ipc_simple_command(int cmd, int sub); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, -- cgit v1.2.3 From 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 18 May 2010 13:59:05 +0200 Subject: x86, mce: Notify about corrected events too Notify all parties registered on the mce decoder chain about logged correctable MCEs. Signed-off-by: Borislav Petkov Acked-by: Doug Thompson Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc4256225..1970ef911c9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } -- cgit v1.2.3 From 5d77b85458f656923b85291a4ff56ed44859ed52 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 20 Jul 2010 13:52:00 -0400 Subject: [CPUFREQ] pcc driver should check for pcch method before calling _OSC The pcc specification documents an _OSC method that's incompatible with the one defined as part of the ACPI spec. This shouldn't be a problem as both are supposed to be guarded with a UUID. Unfortunately approximately nobody (including HP, who wrote this spec) properly checks the UUID on entry to the _OSC call. Right now this could result in surprising behaviour if the pcc driver performs an _OSC call on a machine that doesn't implement the pcc specification. Check whether the PCCH method exists first in order to reduce this probability.
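The probe pattern used here generalizes to any optional ACPI method: look the handle up by name and bail out before evaluating anything. Roughly (a sketch mirroring the hunk below; variable names are illustrative):

	acpi_status status;
	acpi_handle sb_handle, pcch_handle;

	status = acpi_get_handle(NULL, "\\_SB", &sb_handle);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	/* Bail out early unless the platform really implements PCCH. */
	status = acpi_get_handle(sb_handle, "PCCH", &pcch_handle);
	if (ACPI_FAILURE(status))
		return -ENODEV;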
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e7..01bd25c3c7c 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void) struct pcc_memory_resource *mem_resource; struct pcc_register_resource *reg_resource; union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle; + acpi_handle handle, osc_handle, pcch_handle; int ret = 0; status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) return -ENODEV; + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + status = acpi_get_handle(handle, "_OSC", &osc_handle); if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); -- cgit v1.2.3 From 0d9715d64fe118dd0957a29e344972b8d3f960e7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 23 Jul 2010 23:06:52 +0100 Subject: [CPUFREQ] fix double freeing in error path of pcc-cpufreq Prevent double freeing on error path. Signed-off-by: Daniel J Blueman Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 01bd25c3c7c..900702888bf 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } + if (out_obj->type != ACPI_TYPE_BUFFER) + return -ENODEV; errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } + if (errors) + return -ENODEV; supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } + if (!(supported & 0x1)) + return -ENODEV; out_free: kfree(output.pointer); -- cgit v1.2.3 From 6ebdf777ba034d2b54c99f28a4b18dabf286d8e5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 15 Jul 2010 11:44:00 -0400 Subject: [CPUFREQ] Fix PCC driver error path The PCC cpufreq driver unmaps the mailbox address range if any CPUs fail to initialise, but doesn't do anything to remove the registered CPUs from the cpufreq core resulting in failures further down the line. We're better off simply returning a failure - the cpufreq core will unregister us cleanly if we end up with no successfully registered CPUs. Tidy up the failure path and also add a sanity check to ensure that the firmware gives us a realistic frequency - the core deals badly with that being set to 0. 
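The convention relied on here is that a cpufreq driver's ->init callback can simply return an error and let the core unwind whatever was registered before the failure. A minimal sketch (the driver and frequency-reading helper are hypothetical):

	static int example_cpufreq_cpu_init(struct cpufreq_policy *policy)
	{
		unsigned int cur = example_get_freq(policy->cpu); /* hypothetical */

		/* The core deals badly with policy->cur == 0, so fail instead. */
		if (!cur)
			return -EINVAL;

		policy->cur = cur;
		return 0;	/* any nonzero return: core unregisters us cleanly */
	}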
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 900702888bf..a36de5bbb62 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -541,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!pcch_virt_addr) { result = -1; - goto pcch_null; + goto out; } result = pcc_get_offset(cpu); if (result) { dprintk("init: PCCP evaluation failed\n"); - goto free; + goto out; } policy->max = policy->cpuinfo.max_freq = @@ -556,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) ioread32(&pcch_hdr->minimum_frequency) * 1000; policy->cur = pcc_get_freq(cpu); + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + dprintk("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); - - return 0; -free: - pcc_clear_mapping(); - free_percpu(pcc_cpu_info); -pcch_null: +out: return result; } -- cgit v1.2.3 From c2f4a2c6e08c7635316dfd25ef706e9104384c56 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 17:55:30 +0200 Subject: [CPUFREQ] powernow-k8: Limit Pstate transition latency check The Pstate transition latency check was added for broken F10h BIOSen which wrongly contain a value of 0 for transition and bus master latency. Fam11h and later, however, (will) have similar transition latency so extend that behavior for them too. Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e..3e90cce3dc8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1023,13 +1023,12 @@ static int get_transition_latency(struct powernow_k8_data *data) } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit v1.2.3 From 298decfbc44e9a4cb7862ae1b7dfc4e1ba3551b9 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Wed, 20 Jan 2010 19:19:33 +0200 Subject: [CPUFREQ] powernow-k8: On load failure, remind the user to enable support in BIOS setup On Wed, 2010-01-20 at 16:56 +0100, Thomas Renninger wrote: > But most often this happens if people upgrade their CPU and do not > update their BIOS. > Or the vendor does not recognise the new CPU even if the BIOS got > updated. Maybe some of those people just didn't realize it was disabled in BIOS? If you tell users that it's a firmware bug then they'll probably just give up. 
> The message itself might be an enhancement, IMO it's not worth a patch. Why do you think so? I spent an hour hunting down the BIOS upgrade, only to find that it didn't improve anything. It was a day later that I realized that it might be a BIOS option; and the option was literally the _last_ option in the whole BIOS setup. :) This message would have saved the day. > But do not revert the FW_BUG part! Sure, you have a point here. How about this patch? --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce3dc8..c48b44b3b43 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } -- cgit v1.2.3 From 6b72e3934b42930fd40fc42fe762d21be413301c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:35 +0200 Subject: [CPUFREQ] acpi-cpufreq: Fix CPU_ANY CPUFREQ_{PRE,POST}CHANGE notification Signed-off-by: Thomas Renninger CC: venki@google.com CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40e..cee7aa949c3 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -351,7 +351,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -367,7 +367,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } -- cgit v1.2.3 From 6f4f2723d08534fd4e407e1ef8500b0f4d12c30c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:36 +0200 Subject: [CPUFREQ] x86 cpufreq: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. trace_power_frequency was only implemented, in a rather ungeneric way, in the acpi-cpufreq driver's target() function. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where the CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequencies depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and passing the cpu which gets switched automatically fixes this.
Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id Signed-off-by: Thomas Renninger CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org CC: robert.schoene@tu-dresden.de Tested-by: robert.schoene@tu-dresden.de Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cee7aa949c3..246cd3afbb5 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32..787572d43d9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.3 From ccc5638a20b0eb3a66666d9d4dd8fe8f5ad40386 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:03:55 +0400 Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code.
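For reference, the macro is shorthand for the very pci_get_device() walk it replaces; its definition in <linux/pci.h> is approximately:

	#define for_each_pci_dev(d) \
		while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)

	/* Typical use: start from NULL; the walk manages device refcounts. */
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev) {
		if (pci_match_id(example_id_table, pdev)) /* table name hypothetical */
			break;	/* pdev still holds a reference here */
	}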
Signed-off-by: Kulikov Vasiliy Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e..8c3325fee77 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -199,7 +199,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } -- cgit v1.2.3 From 55c789bb2bcdcaa8f1f60687b4a9dbd02ffddd88 Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Thu, 15 Jul 2010 20:36:41 +0200 Subject: [CPUFREQ] Convert pci_table entries to PCI_VDEVICE (if PCI_ANY_ID is used) This patch converts pci_table entries, where .subvendor=PCI_ANY_ID and .subdevice=PCI_ANY_ID, .class=0 and .class_mask=0, to use the PCI_VDEVICE macro, and thus improves readability. Signed-off-by: Peter Huewe Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 8c3325fee77..32974cf8423 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; -- cgit v1.2.3 From b30d3304c9c068ccfe6940232834768af75f8c9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 18:05:14 +0200 Subject: [CPUFREQ] powernow-k8: Fix misleading variable naming rdmsr() takes the lower 32 bits as a second argument and the high 32 as a third. Fix the names accordingly since they were swapped. There should be no functionality change resulting from this patch. 
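To see why the old names were misleading: RDMSR returns the low 32 bits of the MSR in EAX and the high 32 bits in EDX, and the kernel's rdmsr() macro maps its second argument to EAX and its third to EDX. So for a field that lives in the low half, as in the hunk below:

	u32 lo, hi, max_hw_pstate;

	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);	/* lo <- EAX, hi <- EDX */

	/* HW_PSTATE_MAX_MASK selects bits in the low word, hence "lo": */
	max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;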
Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c48b44b3b43..90cab2d4ac0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -912,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; -- cgit v1.2.3 From 7e2d81122052c83feeddbebf706b6d53fba7996d Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:28:49 +0800 Subject: [CPUFREQ] Fix section mismatch for longrun_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c5..fc09f142d94 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; -- cgit v1.2.3 From 2530573e45c5846cd238db78651f0d236fc78aab Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:03 +0800 Subject: [CPUFREQ] Fix section mismatch for longhaul_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. Use the __cpuinitdata for data used by the routines marked as __cpuinit. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. 
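The underlying rule: __init functions live in .init.text, which is discarded once boot finishes, while __cpuinit functions must stay resident whenever CPU hotplug is enabled, because onlining a CPU after boot can still call them. A sketch of the safe annotation (function name illustrative):

	/* May run again when a CPU is onlined after boot, so it must
	 * not be placed in .init.text (which is freed after boot). */
	static int __cpuinit example_cpu_init(struct cpufreq_policy *policy)
	{
		return 0;
	}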
Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/longhaul.h | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f826..03162dac627 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f7..cbf48fbca88 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. 
*/ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, -- cgit v1.2.3 From 307069cf6c53632adc27de4f49bf5d1d67cb87bb Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:16 +0800 Subject: [CPUFREQ] Fix section mismatch for powernow_cpu_init in powernow-k7.c Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e..4a45fd6e41b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. 
*/ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; -- cgit v1.2.3 From 9d1f44ee206a23b975d7d7c6f759efb25e0e61ac Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 3 Aug 2010 13:47:30 -0400 Subject: [CPUFREQ] Remove pointless printk from p4-clockmod. The only machines this is triggering on should be supported by acpi-cpufreq or acpi's internal throttling. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b0..bd1cac747f6 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ -- cgit v1.2.3 From cb84b19474384c572ba3aa2345815e555112ebf5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:43 -0700 Subject: x86, hwmon: Package Level Thermal/Power: pkgtemp hwmon driver This patch adds a hwmon driver for package level thermal control. The driver dumps package level thermal information through a sysfs interface so that upper level applications (e.g. lm-sensors) can retrieve the information. Instead of having the package level hwmon code in coretemp, I wrote a separate driver, pkgtemp, because: First, package level thermal sensors include not only sensors for each core, but also sensors for uncore, memory controller or other components in the package. Logically it is clearer to have a separate package level hwmon driver to monitor a wider range of sensors in a package. Merging the package thermal driver into the core thermal driver doesn't make sense and may mislead. Secondly, merging the two drivers together may cause a coding mess. It's easier to include various package level sensor info as more sensor information is implemented. Coretemp code needs to consider a lot of legacy machine cases. Pkgtemp code only considers platforms starting from Sandy Bridge.
On a 1Sx4Cx2T Sandy Bridge platform, lm-sensors dumps the pkgtemp and coretemp: pkgtemp-isa-0000 Adapter: ISA adapter physical id 0: +33.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0000 Adapter: ISA adapter Core 0: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0001 Adapter: ISA adapter Core 1: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0002 Adapter: ISA adapter Core 2: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0003 Adapter: ISA adapter Core 3: +32.0°C (high = +79.0°C, crit = +99.0°C) [ hpa: folded v3 patch removing improper global variable "SHOW" ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-3-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 1 + arch/x86/configs/x86_64_defconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d28fad19654..e3a32431ca1 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1471,6 +1471,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6c86acd847a..4251f837205 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1456,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set -- cgit v1.2.3 From 55d435a227bd28c77afab326de44dfacc0b15059 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:44 -0700 Subject: x86, hwmon: Package Level Thermal/Power: thermal throttling handler Add package level thermal throttle interrupt support. The interrupt handler increases the package level thermal throttle count. It also logs the event in the MCE log. The package level thermal throttle interrupt happens across threads in a package. Each thread handles the interrupt individually. User level applications are supposed to retrieve the correct event count and log based on package/thread topology. This is the same situation for the core level interrupt handler. In the future, the interrupt may be reported only per package or per core. core_throttle_count and package_throttle_count are used for the user interface. Previously only throttle_count was used for the core throttle count. If you think the new core_throttle_count name breaks the user interface, I can change this part. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-4-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 +++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf971..d307f9f64c2 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -37,7 +37,7 @@ /* * Current thermal throttling state: */ -struct thermal_state { +struct _thermal_state { bool is_throttled; u64 next_check; @@ -45,6 +45,11 @@ struct thermal_state { unsigned long last_throttle_count; }; +struct thermal_state { + struct _thermal_state core; + struct _thermal_state package; +}; + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(level, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##level##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).level.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_one_ro(package_throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; @@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 +static int therm_throt_process(bool is_throttled, int level) { - struct thermal_state *state; + struct _thermal_state *state; unsigned int this_cpu; bool was_throttled; u64 now; this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) + state = &per_cpu(thermal_state, this_cpu).core; + else + state = &per_cpu(thermal_state, this_cpu).package; was_throttled = state->is_throttled; state->is_throttled = is_throttled; @@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled) /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package", + state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_throttle_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device); static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + PACKAGE_LEVEL) != 0) + /* + * Set up the most significant bit to notify mce log + * that this thermal event is a package level event. + * This is a temp solution. May be changed in the future + * with mce log infrasture. + */ + mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); -- cgit v1.2.3 From 0199114c31798af5b83841b21759b64171060d9b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:45 -0700 Subject: x86, hwmon: Package Level Thermal/Power: power limit Power limit notification feature is published in Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification. It is implemented first on Intel Sandy Bridge platform. The patch handles notification interrupt. Interrupt handler dumps power limit information in log_buf, logs the event in mce log, and increases the event counters (core_power_limit and package_power_limit). Upper level applications could use the data to detect system health or diagnose functionality/performance issues. In the future, the event could be handled in a more fancy way. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-5-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 183 ++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d307f9f64c2..c2a8b26d4fe 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ struct _thermal_state { - bool is_throttled; - + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; }; struct thermal_state { - struct _thermal_state core; - struct _thermal_state package; + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly; therm_throt_sysdev_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(level, name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##level##_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).level.name); \ + per_cpu(thermal_state, cpu).event.name); \ } else \ ret = 0; \ preempt_enable(); \ @@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); define_therm_throt_sysdev_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); + static struct attribute *thermal_throttle_attrs[] = { &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -116,49 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. 
*/ -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 -static int therm_throt_process(bool is_throttled, int level) +static int therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - if (level == CORE_LEVEL) - state = &per_cpu(thermal_state, this_cpu).core; - else - state = &per_cpu(thermal_state, this_cpu).package; + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package"); return 1; } @@ -172,21 +207,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) int err; struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) return err; + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, - thermal_throttle_attr_group.name); + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -257,6 +300,17 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -264,21 +318,31 @@ static void intel_thermal_interrupt(void) struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) - /* - * Set up the most significant bit to notify mce log - * that this thermal event is a package level event. - * This is a temp solution. May be changed in the future - * with mce log infrasture. 
- */ - mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); } } @@ -381,14 +445,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); } smp_thermal_vector = intel_thermal_interrupt; -- cgit v1.2.3 From 8a22b9996b001c88f2bfb54c6de6a05fc39e177a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 12 Jul 2010 11:49:59 -0700 Subject: xen: drop xen_sched_clock in favour of using plain wallclock time xen_sched_clock only counts unstolen time. In principle this should be useful to the Linux scheduler so that it knows how much time a process actually consumed. But in practice this doesn't work very well as the scheduler expects the sched_clock time to be synchronized between cpus. It also uses sched_clock to measure the time a task spends sleeping, in which case "unstolen time" isn't meaningful. So just use plain xen_clocksource_read to return wallclock nanoseconds for sched_clock. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/time.c | 39 --------------------------------------- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 399bed2de88..fef034a04c2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -926,7 +926,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 32764b8880b..e90360ff4a0 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -155,45 +155,6 @@ static void do_stolen_accounting(void) account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. 
- */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { -- cgit v1.2.3 From c06ee78d73fd24e8d8a65f16380f6a0551107e1b Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 19 Jul 2010 10:25:08 -0700 Subject: xen: support large numbers of CPUs with vcpu info placement When vcpu info placement is supported, we're not limited to MAX_VIRT_CPUS vcpus. However, if it isn't supported, then ignore any excess vcpus. Signed-off-by: Mukesh Rathor Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fef034a04c2..90a3e802676 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -97,6 +97,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -104,13 +112,17 @@ static void xen_vcpu_setup(int cpu) struct vcpu_info *vcpup; BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } + vcpup = &per_cpu(xen_vcpu_info, cpu); info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); @@ -125,6 +137,7 @@ static void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ -- cgit v1.2.3 From f09f6d194d85043e0eb105a577e7ad6d8170ab66 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Thu, 15 Jul 2010 14:56:49 -0400 Subject: Xen: register panic notifier to take crashes of xen guests on panic Register a panic notifier so that when the guest crashes it can shut down the domain and indicate it was a crash to the host. 
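For reference, this follows the stock kernel panic-notifier pattern; a minimal sketch with placeholder names (the real hook added below is xen_panic_event()):

	#include <linux/kernel.h>	/* panic_notifier_list */
	#include <linux/notifier.h>

	static int example_panic_event(struct notifier_block *this,
				       unsigned long event, void *ptr)
	{
		/* tell the host this was a crash, then let other notifiers run */
		return NOTIFY_DONE;
	}

	static struct notifier_block example_panic_block = {
		.notifier_call	= example_panic_event,
	};

	/* registered once during arch setup: */
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_block);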
Signed-off-by: Donald Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 20 ++++++++++++++++++++ arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 90a3e802676..d99522e8f03 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1040,6 +1040,26 @@ static void xen_crash_shutdown(struct pt_regs *regs) xen_reboot(SHUTDOWN_crash); } +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct sched_shutdown r = { .reason = SHUTDOWN_crash}; + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) + BUG(); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + static const struct machine_ops __initdata xen_machine_ops = { .restart = xen_restart, .halt = xen_machine_halt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9deb6bab6c7..328b0030542 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -226,6 +226,8 @@ void __init xen_arch_setup(void) struct physdev_set_iopl set_iopl; int rc; + xen_panic_handler_init(); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bc..00d59d608ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -101,4 +101,6 @@ void xen_sysret32(void); void xen_sysret64(void); void xen_adjust_exception_frame(void); +extern int xen_panic_handler_init(void); + #endif /* XEN_OPS_H */ -- cgit v1.2.3 From 086748e52fb072ff0935ba4512e29c421bd5b716 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 3 Aug 2010 14:55:14 -0700 Subject: xen/panic: use xen_reboot and fix smp_send_stop Offline vcpu when using stop_self. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 +---- arch/x86/xen/smp.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d99522e8f03..3c4da8bee06 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1043,10 +1043,7 @@ static void xen_crash_shutdown(struct pt_regs *regs) static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct sched_shutdown r = { .reason = SHUTDOWN_crash}; - - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) - BUG(); + xen_reboot(SHUTDOWN_crash); return NOTIFY_DONE; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd313..25f232b18a8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -394,6 +394,8 @@ static void stop_self(void *v) load_cr3(swapper_pg_dir); /* should set up a minimal gdt */ + set_cpu_online(cpu, false); + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); BUG(); } -- cgit v1.2.3 From a7c55cbee0c1bae9bf5a15a08300e91d88706e45 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 4 Aug 2010 20:27:05 -0400 Subject: oprofile: add support for Intel processor model 30 Newer Intel processors identifying themselves as model 30 are not recognized by oprofile. 
model : 30 model name : Intel(R) Xeon(R) CPU X3470 @ 2.93GHz Running oprofile on these machines gives the following: + opcontrol --init + opcontrol --list-events oprofile: available events for CPU type "Intel Architectural Perfmon" See Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3B (Document 253669) Chapter 18 for architectural perfmon events This is a limited set of fallback events because oprofile doesn't know your CPU CPU_CLK_UNHALTED: (counter: all) Clock cycles when not halted (min count: 6000) INST_RETIRED: (counter: all) number of instructions retired (min count: 6000) LLC_MISSES: (counter: all) Last level cache demand requests from this core that missed the LLC (min count: 6000) Unit masks (default 0x41) ---------- 0x41: No unit mask LLC_REFS: (counter: all) Last level cache demand requests from this core (min count: 6000) Unit masks (default 0x4f) ---------- 0x4f: No unit mask BR_MISS_PRED_RETIRED: (counter: all) number of mispredicted branches retired (precise) (min count: 500) + opcontrol --shutdown Tested using oprofile 0.9.6. Signed-off-by: Josh Hunt Reviewed-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1ba67dc8006..f6b48f6c595 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -668,6 +668,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 0x1a: + case 0x1e: case 0x2e: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; -- cgit v1.2.3 From 12bfa3de63504d879ae427ec1f2884fc46556157 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:20 -0500 Subject: kgdb,x86: Individual register get/set for x86 Implement the ability to individually get and set registers for kdb and kgdb for x86. Signed-off-by: Jason Wessel Acked-by: H. Peter Anvin CC: Ingo Molnar CC: x86@kernel.org --- arch/x86/include/asm/kgdb.h | 20 +++--- arch/x86/kernel/kgdb.c | 168 ++++++++++++++++++++++---------------------- 2 files changed, 94 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 006da3687cd..396f5b5fc4d 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -39,9 +39,11 @@ enum regnames { GDB_FS, /* 14 */ GDB_GS, /* 15 */ }; +#define GDB_ORIG_AX 41 +#define DBG_MAX_REG_NUM 16 #define NUMREGBYTES ((GDB_GS+1)*4) #else /* ! CONFIG_X86_32 */ -enum regnames64 { +enum regnames { GDB_AX, /* 0 */ GDB_BX, /* 1 */ GDB_CX, /* 2 */ @@ -59,15 +61,15 @@ enum regnames64 { GDB_R14, /* 14 */ GDB_R15, /* 15 */ GDB_PC, /* 16 */ + GDB_PS, /* 17 */ + GDB_CS, /* 18 */ + GDB_SS, /* 19 */ }; - -enum regnames32 { - GDB_PS = 34, - GDB_CS, - GDB_SS, -}; -#define NUMREGBYTES ((GDB_SS+1)*4) -#endif /* CONFIG_X86_32 */ +#define GDB_ORIG_AX 57 +#define DBG_MAX_REG_NUM 20 +/* 17 64 bit regs and 3 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae..bae89825e14 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -49,55 +49,94 @@ #include #include -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. 
- * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return 
NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -150,47 +189,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. - */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; -- cgit v1.2.3 From 9264b278be42c031dc76517a0d4bb154f5dcf470 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Aug 2010 09:22:24 -0500 Subject: KGDB: Remove set but unused newPC Found by gcc 4.6's new warnings Signed-off-by: Andi Kleen Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bae89825e14..a8b80979ceb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -456,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -467,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); -- cgit v1.2.3 From df4939350b345ebb44937902827aa75b8ad4998c Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Thu, 5 Aug 2010 09:22:25 -0500 Subject: kgdb,x86: use macro HBP_NUM to replace magic number 4 Use the macros provided by the HW breakpoint API. 
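On x86 the limit is the four debug address registers DR0-DR3, which the HW breakpoint API already names; roughly (sketch, based on arch/x86/include/asm/hw_breakpoint.h):

	#define HBP_NUM 4	/* DR0..DR3: four hardware breakpoint slots */

	/* so slot loops read as: */
	for (i = 0; i < HBP_NUM; i++)
		if (!breakinfo[i].enabled)
			break;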
Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a8b80979ceb..ef10940e1af 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -195,7 +195,7 @@ static struct hw_breakpoint { int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; static unsigned long early_dr7; @@ -203,7 +203,7 @@ static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; @@ -290,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -311,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); @@ -331,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -395,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; if (dbg_is_early) { @@ -640,7 +640,7 @@ void kgdb_arch_late(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); -- cgit v1.2.3 From 5989cd6a1cbf86587edcc856791f960978087311 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Aug 2010 13:30:27 -0700 Subject: x86, apic: Map the local apic when parsing the MP table. This fixes a regression in 2.6.35 from 2.6.34, that is present for select models of Intel cpus when people are using an MP table. The commit cf7500c0ea133d66f8449d86392d83f840102632 "x86, ioapic: In mpparse use mp_register_ioapic" started calling mp_register_ioapic from MP_ioapic_info. An extremely simple change that was obviously correct. Unfortunately mp_register_ioapic did just a little more than the previous hand crafted code and so we gained this call path. The problem call path is: MP_ioapic_info() mp_register_ioapic() io_apic_unique_id() io_apic_get_unique_id() get_physical_broadcast() modern_apic() lapic_get_version() apic_read(APIC_LVR) Which turned out to be a problem because the local apic was not mapped, at that point, unlike the similar point in the ACPI parsing code. This problem is fixed by mapping the local apic when parsing the mptable as soon as we reasonably can. Looking at the number of places we setup the fixmap for the local apic, I see some serious simplification opportunities. For the moment except for not duplicating the setting up of the fixmap in init_apic_mappings, I have not acted on them. 
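Conceptually, the fix is just to establish the APIC fixmap before anything on that call path reads an APIC register; a sketch of the idea (the actual helper added below is smp_register_lapic_address()):

	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);	/* map the local APIC */
	apic_read(APIC_LVR);	/* reads like this are only safe afterwards */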
The regression from 2.6.34 is tracked in bug https://bugzilla.kernel.org/show_bug.cgi?id=16173 Cc: 2.6.35 Reported-by: David Hill Reported-by: Tvrtko Ursulin Tested-by: Tvrtko Ursulin Signed-off-by: Eric W. Biederman LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/mpparse.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489ee6ca..c07e51391a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void) * acpi lapic path already maps that address in * acpi_register_lapic_address() */ - if (!acpi_lapic) + if (!acpi_lapic && !smp_found_config) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54b..d7b6f7fb4fe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +static void __init smp_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; + /* Initialize the lapic mapping */ + if (!acpi_lapic) + smp_register_lapic_address(mpc->lapic); + if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); -- cgit v1.2.3 From 7645e4320497b35ce9fb6c2269ebcd57af9fe735 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 6 Aug 2010 12:18:11 -0700 Subject: x86, kvm: Remove cast obsoleted by set_64bit() prototype cleanup KVM ended up having to put a pretty ugly wrapper around set_64bit() in order to get the type right. Now set_64bit() takes the expected u64 type, and this wrapper can be cleaned up. Signed-off-by: H. Peter Anvin Cc: Avi Kivity LKML-Reference: <4C5C4E7A.8040603@kernel.org> Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0dcc95e0987..311f6dad895 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -281,11 +281,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) static void __set_spte(u64 *sptep, u64 spte) { -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif + set_64bit(sptep, spte); } static u64 __xchg_spte(u64 *sptep, u64 new_spte) -- cgit v1.2.3 From 7e005f79791dcd58436c88ded4a7f5aed1b82147 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 31 May 2010 15:59:04 +0900 Subject: remove needless ISA_DMA_THRESHOLD Architectures don't need to define ISA_DMA_THRESHOLD anymore. 
Signed-off-by: FUJITA Tomonori Acked-by: James Bottomley Acked-by: David Howells Signed-off-by: Jens Axboe --- arch/x86/include/asm/scatterlist.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index fb0b1874396..4240878b9d7 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -3,7 +3,6 @@ #include -#define ISA_DMA_THRESHOLD (0x00ffffff) #define ARCH_HAS_SG_CHAIN #endif /* _ASM_X86_SCATTERLIST_H */ -- cgit v1.2.3 From 1c250d709fdc8aa5bf42d90be99428a01a256a55 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 5 Aug 2010 19:09:17 +0400 Subject: perf, x86: P4 PMU -- update nmi irq statistics and unmask lvt entry properly If the last active performance counter has not overflowed at the moment the NMI is triggered by another counter, the irq statistics may miss an update stage. As a more serious consequence, the apic quirk may not be triggered, so the apic lvt entry stays masked. Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20100805150917.GA6311@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 107711bf0ee..febb12cea79 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -656,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -666,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); /* it might be unflagged overflow */ - handled = p4_pmu_clear_cccr_ovf(hwc); + overflow = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; + handled += overflow; + /* event overflow for sure */ data.period = event->hw.last_period; @@ -687,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled; + return handled > 0; } /* -- cgit v1.2.3 From 597781f3e51f48ef8e67be772196d9e9673752c4 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Mon, 9 Aug 2010 17:18:32 -0700 Subject: kmap_atomic: make kunmap_atomic() harder to misuse kunmap_atomic() is currently at level -4 on Rusty's "Hard To Misuse" list[1] ("Follow common convention and you'll get it wrong"), except in some architectures when CONFIG_DEBUG_HIGHMEM is set[2][3]. kunmap() takes a pointer to a struct page; kunmap_atomic(), however, takes a pointer to within the page itself. This seems to trip people up once in a while (the convention they are following is the one from kunmap()). Make it much harder to misuse, by moving it to level 9 on Rusty's list[4] ("The compiler/linker won't let you get it wrong"). This is done by refusing to build if the type of its first argument is a pointer to a struct page. The real kunmap_atomic() is renamed to kunmap_atomic_notypecheck() (which is what you would call if, for some strange reason, calling it with a pointer to a struct page is not incorrect in your code). The previous version of this patch was compile tested on x86-64. 
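The arch/x86 diff below only performs the rename; the compile-time check itself lands in the generic highmem header and looks roughly like this (sketch, not part of the x86 diff):

	#define kunmap_atomic(addr, idx) do { \
			BUILD_BUG_ON(__same_type((addr), struct page *)); \
			kunmap_atomic_notypecheck((addr), (idx)); \
		} while (0)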
[1] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html [2] In these cases, it is at level 5, "Do it right or it will always break at runtime." [3] At least mips and powerpc look very similar, and sparc also seems to share a common ancestor with both; there seems to be quite some degree of copy-and-paste coding here. The include/asm/highmem.h file for these three archs mention x86 CPUs at its top. [4] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html [5] As an aside, could someone tell me why mn10300 uses unsigned long as the first parameter of kunmap_atomic() instead of void *? Signed-off-by: Cesar Eduardo Barros Cc: Russell King (arch/arm) Cc: Ralf Baechle (arch/mips) Cc: David Howells (arch/frv, arch/mn10300) Cc: Koichi Yasutake (arch/mn10300) Cc: Kyle McMartin (arch/parisc) Cc: Helge Deller (arch/parisc) Cc: "James E.J. Bottomley" (arch/parisc) Cc: Benjamin Herrenschmidt (arch/powerpc) Cc: Paul Mackerras (arch/powerpc) Cc: "David S. Miller" (arch/sparc) Cc: Thomas Gleixner (arch/x86) Cc: Ingo Molnar (arch/x86) Cc: "H. Peter Anvin" (arch/x86) Cc: Arnd Bergmann (include/asm-generic) Cc: Rusty Russell ("Hard To Misuse" list) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a726650fc80..8caac76ac32 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba66cbe..5e8fa12ef86 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type) return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); -- cgit v1.2.3 From 4e60c86bd9e5a7110ed28874d0b6592186550ae8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 9 Aug 2010 17:19:03 -0700 Subject: gcc-4.6: mm: fix unused but set warnings No real bugs, just some dead code and some fixups. 
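The recurring fix is the cast-to-void idiom, which makes a macro or function formally use a value; a minimal illustration (not from the patch):

	extern int compute(void);

	void example(void)
	{
		int x = compute();
		/* without the next line, gcc 4.6 warns:
		 * "variable 'x' set but not used" */
		(void)x;
	}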
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 181be528c61..076052cd62b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_unmap(pte) ((void)(pte))/* NOP */ +#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) -- cgit v1.2.3 From d7a7c573936a86474c4a5090a45a4bc6e680c117 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Aug 2010 17:20:33 -0700 Subject: x86, ia64, smp: use workqueues unconditionally during do_boot_cpu() Workqueues are now initialized as part of an early_initcall(). So they are available for use during the cold boot process as well. Signed-off-by: Suresh Siddha Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51620953b18..a5e928b0cb5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); -- cgit v1.2.3 From 8cbd84f2dd4e52a8771b191030c374ba3e56d291 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Aug 2010 15:35:10 -0700 Subject: x86: fix up system call numbering nit As pointed out by Jiri Slaby: when I resolved the 32-bit x86 system call entry tables for prlimit (due to the conflict with fanotify), I forgot to add the numbering in comments that we do for every fifth entry. 
Reported-by: Jiri Slaby Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 91dc4bb1303..b86feabed69 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -844,5 +844,5 @@ ia32_sys_call_table: .quad compat_sys_recvmmsg .quad sys_fanotify_init .quad sys32_fanotify_mark - .quad sys_prlimit64 + .quad sys_prlimit64 /* 340 */ ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4802accb9d8..b35786dc9b8 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -339,4 +339,4 @@ ENTRY(sys_call_table) .long sys_recvmmsg .long sys_fanotify_init .long sys_fanotify_mark - .long sys_prlimit64 + .long sys_prlimit64 /* 340 */ -- cgit v1.2.3 From 8fd49936a8cac246fc9ed85508556c82cd44cf68 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Aug 2010 15:37:41 +0900 Subject: x86: Document __phys_reloc_hide() usage in __pa_symbol() Until all supported versions of gcc recognize -fno-strict-overflow, we should keep the RELOC_HIDE() magic in __pa_symbol(). Comment it. Signed-off-by: Namhyung Kim LKML-Reference: <1281508661-29507-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 625c3f0e741..8ca82839288 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ +/* + * We need __phys_reloc_hide() here because gcc may assume that there is no + * overflow during __pa() calculation and can optimize it unexpectedly. + * Newer versions of gcc provide -fno-strict-overflow switch to handle this + * case properly. Once all supported versions of gcc understand it, we can + * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) + */ #define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -- cgit v1.2.3 From 4565f0170dfc849b3629c27d769db800467baa62 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:22 -0700 Subject: dma-mapping: unify dma_get_cache_alignment implementations dma_get_cache_alignment returns the minimum DMA alignment. Architectures define it as ARCH_DMA_MINALIGN (formerly ARCH_KMALLOC_MINALIGN). So we can unify dma_get_cache_alignment implementations. Note that some architectures implement dma_get_cache_alignment wrongly. dma_get_cache_alignment() should return the minimum DMA alignment. So fully-coherent architectures should return 1. This patch also fixes this issue. 
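The unified copy ends up in the generic dma-mapping header, roughly like this (sketch; the x86 diff below only removes the arch-private version):

	static inline int dma_get_cache_alignment(void)
	{
	#ifdef ARCH_DMA_MINALIGN
		return ARCH_DMA_MINALIGN;
	#endif
		return 1;	/* fully-coherent: no minimum DMA alignment */
	}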
Signed-off-by: FUJITA Tomonori Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ac91eed2106..f9c67e83f64 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -87,13 +87,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -static inline int dma_get_cache_alignment(void) -{ - /* no easy way to get cache size on all x86, so return the - * maximum possible, to be safe */ - return boot_cpu_data.x86_clflush_size; -} - static inline unsigned long dma_alloc_coherent_mask(struct device *dev, gfp_t gfp) { -- cgit v1.2.3 From 3b9c6c11f519718d618f5d7c9508daf78b207f6f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:25 -0700 Subject: dma-mapping: remove dma_is_consistent API Architectures implement dma_is_consistent() in different ways (some misinterpret the definition of the API in DMA-API.txt). So it hasn't been very useful for drivers. We have only one user of the API in tree, and it is unlikely that out-of-tree drivers use the API. Even if we fix dma_is_consistent() in some architectures, it doesn't look useful at all. It was invented long ago for some old systems that can't allocate coherent memory at all. It's better to export only APIs that are definitely necessary for drivers. Let's remove this API. Signed-off-by: FUJITA Tomonori Cc: James Bottomley Reviewed-by: Konrad Rzeszutek Wilk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f9c67e83f64..d4c419f883a 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) -#define dma_is_consistent(d, h) (1) extern int dma_supported(struct device *hwdev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); -- cgit v1.2.3 From 30246557a06bb20618bed906a06d1e1e0faa8bb4 Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Fri, 6 Aug 2010 04:04:38 +0200 Subject: x86, asm: Refactor atomic64_386_32.S to support old binutils and be cleaner The old code didn't work on binutils 2.12 because setting a symbol to a register apparently requires a fairly recent version. This commit refactors the code to use the C preprocessor instead, and in the process makes the whole code a bit easier to understand. The object code produced is unchanged as expected. This fixes kernel bugzilla 16506. Reported-by: Dieter Stussy Signed-off-by: Luca Barbieri Signed-off-by: H. 
Peter Anvin Cc: 2.6.35 LKML-Reference: --- arch/x86/lib/atomic64_386_32.S | 236 ++++++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 4a5979aa688..78ee8e0fa27 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -25,150 +25,170 @@ CFI_ADJUST_CFA_OFFSET -4 .endm -.macro BEGIN func reg -$v = \reg - -ENTRY(atomic64_\func\()_386) - CFI_STARTPROC - LOCK $v - -.macro RETURN - UNLOCK $v +#define BEGIN(op) \ +.macro END; \ + CFI_ENDPROC; \ +ENDPROC(atomic64_##op##_386); \ +.purgem END; \ +.endm; \ +ENTRY(atomic64_##op##_386); \ + CFI_STARTPROC; \ + LOCK v; + +#define RET \ + UNLOCK v; \ ret -.endm - -.macro END_ - CFI_ENDPROC -ENDPROC(atomic64_\func\()_386) -.purgem RETURN -.purgem END_ -.purgem END -.endm - -.macro END -RETURN -END_ -.endm -.endm - -BEGIN read %ecx - movl ($v), %eax - movl 4($v), %edx -END - -BEGIN set %esi - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN xchg %esi - movl ($v), %eax - movl 4($v), %edx - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN add %ecx - addl %eax, ($v) - adcl %edx, 4($v) -END -BEGIN add_return %ecx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN sub %ecx - subl %eax, ($v) - sbbl %edx, 4($v) -END - -BEGIN sub_return %ecx +#define RET_END \ + RET; \ + END + +#define v %ecx +BEGIN(read) + movl (v), %eax + movl 4(v), %edx +RET_END +#undef v + +#define v %esi +BEGIN(set) + movl %ebx, (v) + movl %ecx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(xchg) + movl (v), %eax + movl 4(v), %edx + movl %ebx, (v) + movl %ecx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(add) + addl %eax, (v) + adcl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(add_return) + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(sub) + subl %eax, (v) + sbbl %edx, 4(v) +RET_END +#undef v + +#define v %ecx +BEGIN(sub_return) negl %edx negl %eax sbbl $0, %edx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN inc %esi - addl $1, ($v) - adcl $0, 4($v) -END - -BEGIN inc_return %esi - movl ($v), %eax - movl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(inc) + addl $1, (v) + adcl $0, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(inc_return) + movl (v), %eax + movl 4(v), %edx addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN dec %esi - subl $1, ($v) - sbbl $0, 4($v) -END - -BEGIN dec_return %esi - movl ($v), %eax - movl 4($v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(dec) + subl $1, (v) + sbbl $0, 4(v) +RET_END +#undef v + +#define v %esi +BEGIN(dec_return) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END + movl %eax, (v) + movl %edx, 4(v) +RET_END +#undef v -BEGIN add_unless %ecx +#define v %ecx +BEGIN(add_unless) addl %eax, %esi adcl %edx, %edi - addl ($v), %eax - adcl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx cmpl %eax, %esi je 3f 1: - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: cmpl %edx, %edi jne 1b xorl %eax, %eax jmp 2b -END_ +END +#undef v -BEGIN inc_not_zero %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(inc_not_zero) + movl (v), %eax + movl 4(v), %edx testl %eax, %eax je 3f 
1: addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: testl %edx, %edx jne 1b jmp 2b -END_ +END +#undef v -BEGIN dec_if_positive %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(dec_if_positive) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx js 1f - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) 1: -END +RET_END +#undef v -- cgit v1.2.3 From 417484d47e115774745ef025bce712a102b6f86f Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Thu, 12 Aug 2010 07:00:35 -0700 Subject: x86, asm: Use a lower case name for the end macro in atomic64_386_32.S Use a lowercase name for the end macro, which somehow fixes a binutils 2.16 problem. Signed-off-by: Luca Barbieri LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/lib/atomic64_386_32.S | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 78ee8e0fa27..2cda60a06e6 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -26,35 +26,37 @@ .endm #define BEGIN(op) \ -.macro END; \ +.macro endp; \ CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ -.purgem END; \ +.purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ CFI_STARTPROC; \ LOCK v; +#define ENDP endp + #define RET \ UNLOCK v; \ ret -#define RET_END \ +#define RET_ENDP \ RET; \ - END + ENDP #define v %ecx BEGIN(read) movl (v), %eax movl 4(v), %edx -RET_END +RET_ENDP #undef v #define v %esi BEGIN(set) movl %ebx, (v) movl %ecx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -63,14 +65,14 @@ BEGIN(xchg) movl 4(v), %edx movl %ebx, (v) movl %ecx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx BEGIN(add) addl %eax, (v) adcl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -79,14 +81,14 @@ BEGIN(add_return) adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx BEGIN(sub) subl %eax, (v) sbbl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -98,14 +100,14 @@ BEGIN(sub_return) adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi BEGIN(inc) addl $1, (v) adcl $0, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -116,14 +118,14 @@ BEGIN(inc_return) adcl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %esi BEGIN(dec) subl $1, (v) sbbl $0, 4(v) -RET_END +RET_ENDP #undef v #define v %esi @@ -134,7 +136,7 @@ BEGIN(dec_return) sbbl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_END +RET_ENDP #undef v #define v %ecx @@ -156,7 +158,7 @@ BEGIN(add_unless) jne 1b xorl %eax, %eax jmp 2b -END +ENDP #undef v #define v %esi @@ -177,7 +179,7 @@ BEGIN(inc_not_zero) testl %edx, %edx jne 1b jmp 2b -END +ENDP #undef v #define v %esi @@ -190,5 +192,5 @@ BEGIN(dec_if_positive) movl %eax, (v) movl %edx, 4(v) 1: -RET_END +RET_ENDP #undef v -- cgit v1.2.3 From a3da323420d5aa6f7bd15efc7bf34cd6d19e1f1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 8 Aug 2010 02:37:23 +0900 Subject: [CPUFREQ] add missing __percpu markup in pcc-cpufreq.c pcc_cpu_info is a percpu pointer but was missing __percpu markup. Add it. 
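The marker is for sparse: a __percpu pointer is a cookie into the percpu area, not a plain dereferenceable address, and must be turned into a real pointer through the accessors. A minimal sketch (illustrative fragment):

	static struct pcc_cpu __percpu *pcc_cpu_info;

	/* allocation yields a __percpu cookie ... */
	pcc_cpu_info = alloc_percpu(struct pcc_cpu);

	/* ... which per_cpu_ptr() converts into a usable pointer: */
	struct pcc_cpu *pcc = per_cpu_ptr(pcc_cpu_info, cpu);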
Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5bbb62..994230d4dc4 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -110,7 +110,7 @@ struct pcc_cpu { u32 output_offset; }; -static struct pcc_cpu *pcc_cpu_info; +static struct pcc_cpu __percpu *pcc_cpu_info; static int pcc_cpufreq_verify(struct cpufreq_policy *policy) { -- cgit v1.2.3 From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 Aug 2010 14:20:10 -0700 Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods The current computation, introduced with f12a15be63, of FSEC_PER_SEC using the multiplication of (FSEC_PER_NSEC * NSEC_PER_SEC) is performed only with 32bit integers on small machines, resulting in an overflow and *very* short intervals being programmed. An interrupt storm follows. Note that we also have to specify FSEC_PER_SEC as being long long to overcome the same limitations. Signed-off-by: Chris Wilson Signed-off-by: John Stultz Cc: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 33dbcc4ec5f..351f9c0fea1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) * scaled math multiplication factor for nanosecond to hpet tick * conversion. */ - hpet_freq = 1000000000000000ULL; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); evt->mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, evt->shift); @@ -837,7 +837,7 @@ static int hpet_clocksource_register(void) * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period */ - hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); -- cgit v1.2.3 From 1d6225e8cc5598f2bc5c992f9c88b1137763e8e1 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 9 Aug 2010 16:11:22 -0500 Subject: x86, UV: Make kdump avoid stack dumps - fix !CONFIG_KEXEC breakage This replaces Version 1 of this patch, which broke the build when CONFIG_KEXEC and CONFIG_CRASH_DUMP were configured off. In that case the storage for the 'in_crash_kexec' flag was never built. This version defines that flag as 0 if CONFIG_KEXEC is not set. The patch is tested with all combinations of those two options. Signed-off-by: Cliff Wickman Cc: Andrew Morton LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/kdebug.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 7a2910b3512..5bdfca86581 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -33,6 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +#ifdef CONFIG_KEXEC extern int in_crash_kexec; +#else +/* no crash dump is ever in progress if no crash kernel can be kexec'd */ +#define in_crash_kexec 0 +#endif #endif /* _ASM_X86_KDEBUG_H */ -- cgit v1.2.3 From 3f6c4df7e1f05f2fdb3f20cac9c03f595a72b45d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 13 Aug 2010 23:00:11 +0900 Subject: [CPUFREQ] acpi-cpufreq: add missing __percpu markup acpi_perf_data is a percpu pointer but was missing __percpu markup. Add it. Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 246cd3afbb5..cd8da247dda 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -72,7 +72,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); /* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance *acpi_perf_data; +static struct acpi_processor_performance __percpu *acpi_perf_data; static struct cpufreq_driver acpi_cpufreq_driver; -- cgit v1.2.3 From 96054569190bdec375fe824e48ca1f4e3b53dd36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 Aug 2010 09:49:20 -0700 Subject: x86: don't send SIGBUS for kernel page faults It's wrong for several reasons, but the most direct one is that the fault may be for the stack accesses to set up a previous SIGBUS. When we have a kernel exception, the kernel exception handler does all the fixups, not some user-level signal handler. Even apart from the nested SIGBUS issue, it's also wrong to give out kernel fault addresses in the signal handler info block, or to send a SIGBUS when a system call already returns EFAULT. Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f62777940df..4c4508e8a20 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) + if (!(error_code & PF_USER)) { no_context(regs, error_code, address); + return; + } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) -- cgit v1.2.3 From c7887325230aec47d47a32562a6e26014a0fafca Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Aug 2010 11:26:22 +0100 Subject: Mark arguments to certain syscalls as being const Mark arguments to certain system calls as being const where they should be but aren't. The list includes: (*) The filename arguments of various stat syscalls, execve(), various utimes syscalls and some mount syscalls. 
(*) The filename arguments of some syscall helpers relating to the above. (*) The buffer argument of various write syscalls. Signed-off-by: David Howells Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/x86/ia32/sys_ia32.c | 14 +++++++------- arch/x86/include/asm/sys_ia32.h | 12 ++++++------ arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/process.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 3d093311d5e..849813f398e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -51,7 +51,7 @@ #define AA(__x) ((unsigned long)(__x)) -asmlinkage long sys32_truncate64(char __user *filename, +asmlinkage long sys32_truncate64(const char __user *filename, unsigned long offset_low, unsigned long offset_high) { @@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) return 0; } -asmlinkage long sys32_stat64(char __user *filename, +asmlinkage long sys32_stat64(const char __user *filename, struct stat64 __user *statbuf) { struct kstat stat; @@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename, return ret; } -asmlinkage long sys32_lstat64(char __user *filename, +asmlinkage long sys32_lstat64(const char __user *filename, struct stat64 __user *statbuf) { struct kstat stat; @@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, +asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, struct stat64 __user *statbuf, int flag) { struct kstat stat; @@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, - u32 poslo, u32 poshi) +asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, + u32 count, u32 poslo, u32 poshi) { return sys_pwrite64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); @@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, +asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { long error; diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index cf4e2e381cb..cb238526a9f 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -18,13 +18,13 @@ #include /* ia32/sys_ia32.c */ -asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); +asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long); asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); -asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); -asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *); +asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *); asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); -asmlinkage long sys32_fstatat(unsigned int, char __user *, +asmlinkage long sys32_fstatat(unsigned int, const char __user *, struct stat64 __user *, int); struct mmap_arg_struct32; asmlinkage long sys32_mmap(struct mmap_arg_struct32 
__user *); @@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); -asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); +asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, +asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 5c044b43e9a..feb2ff9bfc2 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); -long sys_execve(char __user *, char __user * __user *, +long sys_execve(const char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c5ea5cdbe7b..17be5ec7cbb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1185,13 +1185,13 @@ END(kernel_thread_helper) * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. * * C extern interface: - * extern long execve(char *name, char **argv, char **envp) + * extern long execve(const char *name, char **argv, char **envp) * * asm input arguments: * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) + * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d401f1d2d06..64ecaf0af9a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. */ -long sys_execve(char __user *name, char __user * __user *argv, +long sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; -- cgit v1.2.3