aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/misc.c2
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c1
-rw-r--r--arch/x86/ia32/sys_ia32.c1
-rw-r--r--arch/x86/include/asm/e820.h3
-rw-r--r--arch/x86/include/asm/fixmap.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/paravirt.h10
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h19
-rw-r--r--arch/x86/include/asm/xen/interface.h6
-rw-r--r--arch/x86/include/asm/xen/interface_32.h5
-rw-r--r--arch/x86/include/asm/xen/interface_64.h13
-rw-r--r--arch/x86/include/asm/xen/page.h7
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c8
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c7
-rw-r--r--arch/x86/kernel/apic/io_apic.c4
-rw-r--r--arch/x86/kernel/apic/probe_64.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c25
-rw-r--r--arch/x86/kernel/cpu/perf_event.c20
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/entry_32.S2
-rw-r--r--arch/x86/kernel/entry_64.S2
-rw-r--r--arch/x86/kernel/head_32.S16
-rw-r--r--arch/x86/kernel/hpet.c26
-rw-r--r--arch/x86/kernel/hw_breakpoint.c4
-rw-r--r--arch/x86/kernel/kgdb.c12
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c64
-rw-r--r--arch/x86/kernel/msr.c1
-rw-r--r--arch/x86/kernel/pvclock.c5
-rw-r--r--arch/x86/kernel/resource.c48
-rw-r--r--arch/x86/kernel/setup.c18
-rw-r--r--arch/x86/kernel/xsave.c3
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/vmx.c24
-rw-r--r--arch/x86/kvm/x86.c11
-rw-r--r--arch/x86/kvm/x86.h5
-rw-r--r--arch/x86/lguest/boot.c16
-rw-r--r--arch/x86/lguest/i386_head.S105
-rw-r--r--arch/x86/mm/tlb.c5
-rw-r--r--arch/x86/pci/acpi.c103
-rw-r--r--arch/x86/pci/i386.c18
-rw-r--r--arch/x86/pci/xen.c27
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2
-rw-r--r--arch/x86/platform/uv/uv_time.c4
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c25
-rw-r--r--arch/x86/xen/mmu.c88
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/setup.c53
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/time.c2
-rw-r--r--arch/x86/xen/xen-ops.h2
56 files changed, 612 insertions, 247 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e8327686d3c..e330da21b84 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_PERF_EVENTS if (!M386 && !M486)
+ select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f21..325c05294fc 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -355,7 +355,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
#else
- if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+ if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93..7a6e68e4f74 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
* by the Free Software Foundation.
*/
+#include <linux/err.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e..5852519b2d0 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/utsname.h>
-#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/poll.h>
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 5be1542fbfa..e99d55d74df 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
#define BIOS_BEGIN 0x000a0000
#define BIOS_END 0x00100000
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
#ifdef __KERNEL__
/* see comment in arch/x86/kernel/e820.c */
extern struct e820map e820;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4d293dced62..9479a037419 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -216,8 +216,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
}
/* Return an pointer with offset calculated */
-static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
__set_fixmap(idx, phys, flags);
return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e6fe391094..f702f82aa1e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,7 @@
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3ea3dc48704..6b89f5e8602 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -128,7 +128,7 @@
#define FAM10H_MMIO_CONF_ENABLE (1<<0)
#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
-#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 18e3b8a8709..ef9975812c7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -824,27 +824,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
-static inline void arch_local_irq_restore(unsigned long f)
+static inline notrace void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long f;
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7f7e577a0e3..31d84acc151 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);
+void pvclock_resume(void);
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index e969f691cbf..a501741c233 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,8 @@ union uvh_apicid {
#define UVH_APICID 0x002D0E00L
#define UV_APIC_PNODE_SHIFT 6
+#define UV_APICID_HIBIT_MASK 0xffff0000
+
/* Local Bus from cpu's perspective */
#define LOCAL_BUS_BASE 0x1c00000
#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
@@ -491,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
}
}
+extern unsigned int uv_apicid_hibits;
static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
{
+ apicid |= uv_apicid_hibits;
return (1UL << UVH_IPI_INT_SEND_SHFT) |
((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
(mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 6d90adf4428..20cafeac745 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
*
* SGI UV MMR definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u {
};
/* ========================================================================= */
+/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
+/* ========================================================================= */
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
+
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uvh_lb_target_physical_apic_id_mask_u {
+ unsigned long v;
+ struct uvh_lb_target_physical_apic_id_mask_s {
+ unsigned long bit_enables : 32; /* RW */
+ unsigned long rsvd_32_63 : 32; /* */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_NODE_ID */
/* ========================================================================= */
#define UVH_NODE_ID 0x0UL
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c5..1c10c88ee4e 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
#endif
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 32
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5..8413688b257 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
/* And the trap vector is... */
#define TRAP_INSTR "int $0x82"
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END 0xF6800000
+
+#define __MACH2PHYS_SHIFT 2
+
/*
* Virtual addresses beyond this are not modifiable by guest OSes. The
* machine->physical mapping table starts at this address, read-only.
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97..839a4811cf9 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT 3
/*
* int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index dd8c1414b3d..8760cc60a21 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -35,6 +36,8 @@ typedef struct xpaddr {
#define MAX_DOMAIN_PAGES \
((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -69,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
-#if 0
if (unlikely((mfn >> machine_to_phys_order) != 0))
- return max_mapnr;
-#endif
+ return ~0;
pfn = 0;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9e13763b609..1e994754d32 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -45,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
+obj-y += resource.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3f838d53739..78218135b48 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1389,6 +1389,14 @@ void __cpuinit end_local_APIC_setup(void)
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
+
+ /*
+ * Now that local APIC setup is completed for BP, configure the fault
+ * handling for interrupt remapping.
+ */
+ if (!smp_processor_id() && intr_remapping_enabled)
+ enable_drhd_fault_handling();
+
}
#ifdef CONFIG_X86_X2APIC
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e..62f6e1e55b9 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,15 +17,16 @@
#include <linux/nmi.h>
#include <linux/module.h>
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
u64 hw_nmi_get_sample_period(void)
{
return (u64)(cpu_khz) * 1000 * 60;
}
#ifdef ARCH_HAS_NMI_WATCHDOG
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
void arch_trigger_all_cpu_backtrace(void)
{
int i;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7cc0a721f62..fadcd743a74 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2430,13 +2430,12 @@ static void ack_apic_level(struct irq_data *data)
{
struct irq_cfg *cfg = data->chip_data;
int i, do_unmask_irq = 0, irq = data->irq;
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long v;
irq_complete_move(cfg);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
+ if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
mask_ioapic(cfg);
}
@@ -3413,6 +3412,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+ msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
dmar_msi_write(irq, &msg);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index f9e4e6a5407..d8c4a6feb28 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void)
/* need to update phys_pkg_id */
apic->phys_pkg_id = apicid_phys_pkg_id;
}
-
- /*
- * Now that apic routing model is selected, configure the
- * fault handling for intr remapping.
- */
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 194539aea17..c1c52c341f4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -44,6 +44,8 @@ static u64 gru_start_paddr, gru_end_paddr;
static union uvh_apicid uvh_apicid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+unsigned int uv_apicid_hibits;
+EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static DEFINE_SPINLOCK(uv_nmi_lock);
static inline bool is_GRU_range(u64 start, u64 end)
@@ -85,6 +87,23 @@ static void __init early_get_apic_pnode_shift(void)
uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
}
+/*
+ * Add an extra bit as dictated by bios to the destination apicid of
+ * interrupts potentially passing through the UV HUB. This prevents
+ * a deadlock between interrupts and IO port operations.
+ */
+static void __init uv_set_apicid_hibit(void)
+{
+ union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+ unsigned long *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE |
+ UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
+ apicid_mask.v = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+}
+
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
int nodeid;
@@ -102,6 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
__get_cpu_var(x2apic_extra_bits) =
nodeid << (uvh_apicid.s.pnode_shift - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
+ uv_set_apicid_hibit();
return 1;
}
}
@@ -155,6 +175,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
+ phys_apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -236,7 +257,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
int cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
else
return BAD_APICID;
}
@@ -255,7 +276,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- return per_cpu(x86_cpu_to_apicid, cpu);
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed6310183ef..6d75b9145b1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -381,6 +381,20 @@ static void release_pmc_hardware(void) {}
#endif
+static bool check_hw_exists(void)
+{
+ u64 val, val_new = 0;
+ int ret = 0;
+
+ val = 0xabcdUL;
+ ret |= checking_wrmsrl(x86_pmu.perfctr, val);
+ ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
+ if (ret || val != val_new)
+ return false;
+
+ return true;
+}
+
static void reserve_ds_buffers(void);
static void release_ds_buffers(void);
@@ -1372,6 +1386,12 @@ void __init init_hw_perf_events(void)
pmu_check_apic();
+ /* sanity check that the hardware exists or is emulated */
+ if (!check_hw_exists()) {
+ pr_cont("Broken PMU hardware detected, software events only.\n");
+ return;
+ }
+
pr_cont("%s PMU driver.\n", x86_pmu.name);
if (x86_pmu.quirks)
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d8..212a6a42527 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 59e175e8959..591e6010427 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,7 +395,7 @@ sysenter_past_esp:
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe2690d71c0..e3ba417e869 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,7 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_args)
XCPT_FRAME
cld
@@ -334,6 +335,7 @@ ENTRY(save_args)
ret
CFI_ENDPROC
END(save_args)
+ .popsection
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bcece91dd31..c0dbd9ac24f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,16 +60,18 @@
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
+/* Number of possible pages in the lowmem region */
+LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = \
- PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
/*
* Worst-case size of the kernel mapping we need to make:
- * the worst-case size of the kernel itself, plus the extra we need
- * to map for the linear map.
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
*/
-KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+KERNEL_PAGES = LOWMEM_PAGES
INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
@@ -620,13 +622,13 @@ ENTRY(initial_code)
__PAGE_ALIGNED_BSS
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
-initial_pg_pmd:
+ENTRY(initial_pg_pmd)
.fill 1024*KPMDS,4,0
#else
ENTRY(initial_page_table)
.fill 1024,4,0
#endif
-initial_pg_fixmap:
+ENTRY(initial_pg_fixmap)
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ae03cab4352..4ff5968f12d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
#define HPET_DEV_FSB_CAP 0x1000
#define HPET_DEV_PERI_CAP 0x2000
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
+
#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
/*
@@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void)
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
- /* 5 usec minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = 5000;
+ /* Setup minimum reprogramming delta. */
+ hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
+ &hpet_clockevent);
/*
* Start hpet with the boot cpu mask and make it
@@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta,
* the wraparound into account) nor a simple count down event
* mode. Further the write to the comparator register is
* delayed internally up to two HPET clock cycles in certain
- * chipsets (ATI, ICH9,10). We worked around that by reading
- * back the compare register, but that required another
- * workaround for ICH9,10 chips where the first readout after
- * write can return the old stale value. We already have a
- * minimum delta of 5us enforced, but a NMI or SMI hitting
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+ * programming delta of 5us enforced, but a NMI or SMI hitting
* between the counter readout and the comparator write can
* move us behind that point easily. Now instead of reading
* the compare register back several times, we make the ETIME
* decision based on the following: Return ETIME if the
- * counter value after the write is less than 8 HPET cycles
+ * counter value after the write is less than HPET_MIN_CYCLES
* away from the event or if the counter is already ahead of
- * the event.
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
*/
res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- return res < 8 ? -ETIME : 0;
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
static void hpet_legacy_set_mode(enum clock_event_mode mode,
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25..42c59425450 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
dr6_p = (unsigned long *)ERR_PTR(args->err);
dr6 = *dr6_p;
+ /* If it's a single step, TRAP bits are random */
+ if (dr6 & DR_STEP)
+ return NOTIFY_DONE;
+
/* Do an early return if no trap bits are set in DR6 */
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ec592caac4b..cd21b654dec 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -315,14 +315,18 @@ static void kgdb_remove_all_hw_break(void)
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
- if (bp->attr.disabled == 1)
+ if (!bp->attr.disabled) {
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
continue;
+ }
if (dbg_is_early)
early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
breakinfo[i].type);
- else
- arch_uninstall_hw_breakpoint(bp);
- bp->attr.disabled = 1;
+ else if (hw_break_release_slot(i))
+ printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+ breakinfo[i].addr);
+ breakinfo[i].enabled = 0;
}
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 6da143c2a6b..ac861b8348e 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
};
static u64 __cpuinitdata fam10h_pci_mmconf_base;
-static int __cpuinitdata fam10h_pci_mmconf_base_status;
static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
return start1 - start2;
}
-/*[47:0] */
-/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
-#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32)))
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
static void __cpuinit get_fam10h_pci_mmconf_base(void)
{
int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
struct range range[8];
/* only try to get setting from BSP */
- /* -1 or 1 */
- if (fam10h_pci_mmconf_base_status)
+ if (fam10h_pci_mmconf_base)
return;
if (!early_pci_allowed())
- goto fail;
+ return;
found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
}
if (!found)
- goto fail;
+ return;
/* SYS_CFG */
address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
/* TOP_MEM2 is not enabled? */
if (!(val & (1<<21))) {
- tom2 = 0;
+ tom2 = 1ULL << 32;
} else {
/* TOP_MEM2 */
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
- tom2 = val & (0xffffULL<<32);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
}
if (base <= tom2)
- base = tom2 + (1ULL<<32);
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
/*
* need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (!(reg & 3))
continue;
- start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
- end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
- if (!end)
+ if (end < tom2)
continue;
range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (range[hi_mmio_num - 1].end < base)
goto out;
- if (range[0].start > base)
+ if (range[0].start > base + MMCONF_SIZE)
goto out;
/* need to find one window */
- base = range[0].start - (1ULL << 32);
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
if ((base > tom2) && BASE_VALID(base))
goto out;
- base = range[hi_mmio_num - 1].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
goto out;
/* need to find window between ranges */
- if (hi_mmio_num > 1)
- for (i = 0; i < hi_mmio_num - 1; i++) {
- if (range[i + 1].start > (range[i].end + (1ULL << 32))) {
- base = range[i].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
- goto out;
- }
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
}
-
-fail:
- fam10h_pci_mmconf_base_status = -1;
return;
+
out:
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
}
void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
/* only trust the one handle 256 buses, if acpi=off */
if (!acpi_pci_disabled || busnbits >= 8) {
- u64 base;
- base = val & (0xffffULL << 32);
- if (fam10h_pci_mmconf_base_status <= 0) {
+ u64 base = val & MMCONF_MASK;
+
+ if (!fam10h_pci_mmconf_base) {
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
return;
} else if (fam10h_pci_mmconf_base == base)
return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
* with 256 buses
*/
get_fam10h_pci_mmconf_base();
- if (fam10h_pci_mmconf_base_status <= 0)
+ if (!fam10h_pci_mmconf_base) {
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
return;
+ }
printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f7..12fcbe2c143 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 008b91eefa1..42eb3300dfc 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -83,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
static atomic64_t last_value = ATOMIC64_INIT(0);
+void pvclock_resume(void)
+{
+ atomic64_set(&last_value, 0);
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 00000000000..2a26819bb6a
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
+#include <linux/ioport.h>
+#include <asm/e820.h>
+
+static void resource_clip(struct resource *res, resource_size_t start,
+ resource_size_t end)
+{
+ resource_size_t low = 0, high = 0;
+
+ if (res->end < start || res->start > end)
+ return; /* no conflict */
+
+ if (res->start < start)
+ low = start - res->start;
+
+ if (res->end > end)
+ high = res->end - end;
+
+ /* Keep the area above or below the conflict, whichever is larger */
+ if (low > high)
+ res->end = start - 1;
+ else
+ res->start = end + 1;
+}
+
+static void remove_e820_regions(struct resource *avail)
+{
+ int i;
+ struct e820entry *entry;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ entry = &e820.map[i];
+
+ resource_clip(avail, entry->addr,
+ entry->addr + entry->size - 1);
+ }
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+ /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ if (avail->flags & IORESOURCE_MEM) {
+ if (avail->start < BIOS_END)
+ avail->start = BIOS_END;
+ resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
+}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 21c6746338a..a0f52af256a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void)
return total << PAGE_SHIFT;
}
-#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
+/*
+ * Keep the crash kernel below this limit. On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
+ * limit once kexec-tools are fixed.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_KERNEL_ADDR_MAX (512 << 20)
+#else
+# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+#endif
+
static void __init reserve_crashkernel(void)
{
unsigned long long total_mem;
@@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void)
const unsigned long long alignment = 16<<20; /* 16M */
/*
- * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
+ * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
crash_base = memblock_find_in_range(alignment,
- DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
+ CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
if (crash_base == MEMBLOCK_ERROR) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
@@ -769,7 +780,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
- resource_alloc_from_bottom = 0;
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e2..547128546cc 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
* Setup init_xstate_buf to represent the init state of
* all the features managed by the xsave
*/
- init_xstate_buf = alloc_bootmem(xstate_size);
+ init_xstate_buf = alloc_bootmem_align(xstate_size,
+ __alignof__(struct xsave_struct));
init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
clts();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 82e144a4e51..b81a9b7c2ca 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3395,6 +3395,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
load_host_msrs(vcpu);
+ kvm_load_ldt(ldt_selector);
loadsegment(fs, fs_selector);
#ifdef CONFIG_X86_64
load_gs_index(gs_selector);
@@ -3402,7 +3403,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#else
loadsegment(gs, gs_selector);
#endif
- kvm_load_ldt(ldt_selector);
reload_tss(vcpu);
@@ -3494,6 +3494,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
+ case 0x00000001:
+ /* Mask out xsave bit as long as it is not supported by SVM */
+ entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
+ break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8da0e45ff7c..81fcbe9515c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -821,10 +821,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
- }
#endif
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -839,23 +838,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
- if (vmx->host_state.fs_reload_needed)
- loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu))
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(vmx->host_state.gs_sel);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
#else
loadsegment(gs, vmx->host_state.gs_sel);
#endif
}
+ if (vmx->host_state.fs_reload_needed)
+ loadsegment(fs, vmx->host_state.fs_sel);
reload_tss();
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- }
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
if (current_thread_info()->status & TS_USEDFPU)
clts();
@@ -4228,11 +4227,6 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa..b989e1f1e5d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -155,11 +155,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -4569,9 +4564,11 @@ static void kvm_timer_init(void)
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
memset(&policy, 0, sizeof(policy));
- cpufreq_get_policy(&policy, get_cpu());
+ cpu = get_cpu();
+ cpufreq_get_policy(&policy, cpu);
if (policy.cpuinfo.max_freq)
max_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
@@ -5522,6 +5519,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (sregs->cr4 & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
mmu_reset_needed = 1;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414489f..c600da830ce 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 73b1e1a1f48..4996cf5f73a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3)
{
lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- cr3_changed = true;
+
+ /* These two page tables are simple, linear, and used during boot */
+ if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
+ cr3_changed = true;
}
static unsigned long lguest_read_cr3(void)
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* to forget all of them. Fortunately, this is very rare.
*
* ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
- * the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds.
+ * which makes booting astonishingly slow: 48 seconds! So we don't even tell
+ * the Host anything changed until we've done the first real page table switch,
+ * which brings boot back to 4.3 seconds.
*/
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
@@ -1002,7 +1005,7 @@ static void lguest_time_init(void)
clockevents_register_device(&lguest_clockevent);
/* Finally, we unblock the timer interrupt. */
- enable_lguest_irq(0);
+ clear_bit(0, lguest_data.blocked_interrupts);
}
/*
@@ -1349,9 +1352,6 @@ __init void lguest_init(void)
*/
switch_to_new_gdt(0);
- /* We actually boot with all memory mapped, but let's say 128MB. */
- max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
-
/*
* The Host<->Guest Switcher lives at the top of our address space, and
* the Host told us how big it is when we made LGUEST_INIT hypercall:
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..e7d5382ef26 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -4,6 +4,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
/*G:020
* Our story starts with the kernel booting into startup_32 in
@@ -37,9 +38,113 @@ ENTRY(lguest_entry)
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
+ call init_pagetables
+
/* Jumps are relative: we're running __PAGE_OFFSET too low. */
jmp lguest_init+__PAGE_OFFSET
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they
+ * don't have a stack at this point, so we can't just use call and ret.
+ */
+init_pagetables:
+#if PTRS_PER_PMD > 1
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
+#else
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
+#endif
+#define pa(X) ((X) - __PAGE_OFFSET)
+
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = \
+ PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+#ifdef CONFIG_X86_PAE
+
+ /*
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD.
+ *
+ * Note the upper half of each PMD or PTE are always zero at this stage.
+ */
+
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+
+ xorl %ebx,%ebx /* %ebx is kept at zero */
+
+ movl $pa(__brk_base), %edi
+ movl $pa(initial_pg_pmd), %edx
+ movl $PTE_IDENT_ATTR, %eax
+10:
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
+ movl %ecx,(%edx) /* Store PMD entry */
+ /* Upper half already zero */
+ addl $8,%edx
+ movl $512,%ecx
+11:
+ stosl
+ xchgl %eax,%ebx
+ stosl
+ xchgl %eax,%ebx
+ addl $0x1000,%eax
+ loop 11b
+
+ /*
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
+ */
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
+ cmpl %ebp,%eax
+ jb 10b
+1:
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
+
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
+#else /* Not PAE */
+
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+ movl $pa(__brk_base), %edi
+ movl $pa(initial_page_table), %edx
+ movl $PTE_IDENT_ATTR, %eax
+10:
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
+ movl %ecx,(%edx) /* Store identity PDE entry */
+ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
+ addl $4,%edx
+ movl $1024, %ecx
+11:
+ stosl
+ addl $0x1000,%eax
+ loop 11b
+ /*
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
+ */
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
+ cmpl %ebp,%eax
+ jb 10b
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
+
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_page_table+0xffc)
+#endif
+ ret
+
/*G:055
* We create a macro which puts the assembler code between lgstart_ and lgend_
* markers. These templates are put in the .text section: they can't be
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad1..6acc724d5d8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static void __cpuinit calculate_tlb_offset(void)
{
- int cpu, node, nr_node_vecs;
+ int cpu, node, nr_node_vecs, idx = 0;
/*
* we are changing tlb_vector_offset for each CPU in runtime, but this
* will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
for_each_online_node(node) {
- int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+ int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
nr_node_vecs;
int cpu_offset = 0;
for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void)
cpu_offset++;
cpu_offset = cpu_offset % nr_node_vecs;
}
+ idx++;
}
}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 15466c096ba..0972315c386 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
struct acpi_resource_address64 addr;
acpi_status status;
unsigned long flags;
- struct resource *root, *conflict;
u64 start, end;
status = resource_to_addr(acpi_res, &addr);
@@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
if (addr.resource_type == ACPI_MEMORY_RANGE) {
- root = &iomem_resource;
flags = IORESOURCE_MEM;
if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
flags |= IORESOURCE_PREFETCH;
} else if (addr.resource_type == ACPI_IO_RANGE) {
- root = &ioport_resource;
flags = IORESOURCE_IO;
} else
return AE_OK;
@@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
}
- conflict = insert_resource_conflict(root, res);
- if (conflict) {
- dev_err(&info->bridge->dev,
- "address space collision: host bridge window %pR "
- "conflicts with %s %pR\n",
- res, conflict->name, conflict);
- } else {
- pci_bus_add_resource(info->bus, res, 0);
- info->res_num++;
- if (addr.translation_offset)
- dev_info(&info->bridge->dev, "host bridge window %pR "
- "(PCI address [%#llx-%#llx])\n",
- res, res->start - addr.translation_offset,
- res->end - addr.translation_offset);
+ info->res_num++;
+ if (addr.translation_offset)
+ dev_info(&info->bridge->dev, "host bridge window %pR "
+ "(PCI address [%#llx-%#llx])\n",
+ res, res->start - addr.translation_offset,
+ res->end - addr.translation_offset);
+ else
+ dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
+
+ return AE_OK;
+}
+
+static bool resource_contains(struct resource *res, resource_size_t point)
+{
+ if (res->start <= point && point <= res->end)
+ return true;
+ return false;
+}
+
+static void coalesce_windows(struct pci_root_info *info, int type)
+{
+ int i, j;
+ struct resource *res1, *res2;
+
+ for (i = 0; i < info->res_num; i++) {
+ res1 = &info->res[i];
+ if (!(res1->flags & type))
+ continue;
+
+ for (j = i + 1; j < info->res_num; j++) {
+ res2 = &info->res[j];
+ if (!(res2->flags & type))
+ continue;
+
+ /*
+ * I don't like throwing away windows because then
+ * our resources no longer match the ACPI _CRS, but
+ * the kernel resource tree doesn't allow overlaps.
+ */
+ if (resource_contains(res1, res2->start) ||
+ resource_contains(res1, res2->end) ||
+ resource_contains(res2, res1->start) ||
+ resource_contains(res2, res1->end)) {
+ res1->start = min(res1->start, res2->start);
+ res1->end = max(res1->end, res2->end);
+ dev_info(&info->bridge->dev,
+ "host bridge window expanded to %pR; %pR ignored\n",
+ res1, res2);
+ res2->flags = 0;
+ }
+ }
+ }
+}
+
+static void add_resources(struct pci_root_info *info)
+{
+ int i;
+ struct resource *res, *root, *conflict;
+
+ if (!pci_use_crs)
+ return;
+
+ coalesce_windows(info, IORESOURCE_MEM);
+ coalesce_windows(info, IORESOURCE_IO);
+
+ for (i = 0; i < info->res_num; i++) {
+ res = &info->res[i];
+
+ if (res->flags & IORESOURCE_MEM)
+ root = &iomem_resource;
+ else if (res->flags & IORESOURCE_IO)
+ root = &ioport_resource;
else
- dev_info(&info->bridge->dev,
- "host bridge window %pR\n", res);
+ continue;
+
+ conflict = insert_resource_conflict(root, res);
+ if (conflict)
+ dev_err(&info->bridge->dev,
+ "address space collision: host bridge window %pR "
+ "conflicts with %s %pR\n",
+ res, conflict->name, conflict);
+ else
+ pci_bus_add_resource(info->bus, res, 0);
}
- return AE_OK;
}
static void
@@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum,
acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
&info);
+ add_resources(&info);
return;
name_alloc_fail:
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index c4bb261c106..b1805b78842 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,21 +65,13 @@ pcibios_align_resource(void *data, const struct resource *res,
resource_size_t size, resource_size_t align)
{
struct pci_dev *dev = data;
- resource_size_t start = round_down(res->end - size + 1, align);
+ resource_size_t start = res->start;
if (res->flags & IORESOURCE_IO) {
-
- /*
- * If we're avoiding ISA aliases, the largest contiguous I/O
- * port space is 256 bytes. Clearing bits 9 and 10 preserves
- * all 256-byte and smaller alignments, so the result will
- * still be correctly aligned.
- */
- if (!skip_isa_ioresource_align(dev))
- start &= ~0x300;
- } else if (res->flags & IORESOURCE_MEM) {
- if (start < BIOS_END)
- start = res->end; /* fail; no space */
+ if (skip_isa_ioresource_align(dev))
+ return start;
+ if (start & 0x300)
+ start = (start + 0x3ff) & ~0x3ff;
}
return start;
}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d7b5109f7a9..25cd4a07d09 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -70,6 +70,9 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
+#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
+ MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
+
static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
struct msi_msg *msg)
{
@@ -83,12 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(pirq);
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- /* delivery mode reserved */
- (3 << 8) |
- MSI_DATA_VECTOR(0);
+ msg->data = XEN_PIRQ_MSI_DATA;
}
static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -98,8 +96,23 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
struct msi_msg msg;
list_for_each_entry(msidesc, &dev->msi_list, list) {
+ __read_msi_msg(msidesc, &msg);
+ pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
+ ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
+ if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
+ xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
+ if (irq < 0)
+ goto error;
+ ret = set_irq_msi(irq, msidesc);
+ if (ret < 0)
+ goto error_while;
+ printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
+ " pirq=%d\n", irq, pirq);
+ return 0;
+ }
xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi", &irq, &pirq);
+ "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
if (irq < 0 || pirq < 0)
goto error;
printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a318194002b..ba9caa808a9 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector)
* the below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS
*/
- apicid = uvhub_to_first_apicid(uvhub);
+ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
((apicid << 32) | vector));
}
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379..9daf5d1af9f 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu)
apicid = cpu_physical_id(cpu);
pnode = uv_apicid_to_pnode(apicid);
+ apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
@@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode)
static int uv_setup_intr(int cpu, u64 expires)
{
u64 val;
+ unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
int pnode = uv_cpu_to_pnode(cpu);
uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
@@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires)
UVH_EVENT_OCCURRED0_RTC1_MASK);
val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
- ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+ ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
/* Set configuration */
uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4a2afa1bac5..b6552b189bc 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
export CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
+VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
+VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
# This makes sure the $(obj) subdirectory exists even though vdso32/
# is not a kbuild sub-make subdirectory.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 235c0f4d386..44dcad43989 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
@@ -1016,10 +1021,6 @@ static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
-#ifdef CONFIG_SMP
- stop_other_cpus();
-#endif
-
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
}
@@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
+ struct physdev_set_iopl set_iopl;
+ int rc;
pgd_t *pgd;
if (!xen_start_info)
@@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
+ xen_setup_machphys_mapping();
+
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
@@ -1191,8 +1196,6 @@ asmlinkage void __init xen_start_kernel(void)
/* Allocate and initialize top and mid mfn levels for p2m structure */
xen_build_mfn_list_list();
- init_mm.pgd = pgd;
-
/* keep using Xen gdt for now; no urgent need to change it */
#ifdef CONFIG_X86_32
@@ -1202,10 +1205,18 @@ asmlinkage void __init xen_start_kernel(void)
#else
pv_info.kernel_rpl = 0;
#endif
-
/* set the limit of our address space */
xen_reserve_top();
+ /* We used to do this in xen_arch_setup, but that is too late on AMD
+ * were early_cpu_init (run before ->arch_setup()) calls early_amd_init
+ * which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 21ed8d7f75a..44924e551fd 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2034,6 +2034,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
set_page_prot(pmd, PAGE_KERNEL_RO);
}
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ }
+ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+}
+
#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
@@ -2119,44 +2133,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
return pgd;
}
#else /* !CONFIG_X86_64 */
-static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+
+static __init void xen_write_cr3_init(unsigned long cr3)
+{
+ unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+ BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+ /*
+ * We are switching to swapper_pg_dir for the first time (from
+ * initial_page_table) and therefore need to mark that page
+ * read-only and then pin it.
+ *
+ * Xen disallows sharing of kernel PMDs for PAE
+ * guests. Therefore we must copy the kernel PMD from
+ * initial_page_table into a new kernel PMD to be used in
+ * swapper_pg_dir.
+ */
+ swapper_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ memcpy(swapper_kernel_pmd, initial_kernel_pmd,
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+ set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ xen_write_cr3(cr3);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ set_page_prot(initial_page_table, PAGE_KERNEL);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
- level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ initial_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
- memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+ memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
- xen_map_identity_early(level2_kernel_pgt, max_pfn);
+ xen_map_identity_early(initial_kernel_pmd, max_pfn);
- memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
- set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
- __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+ memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+ initial_page_table[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+ set_page_prot(initial_page_table, PAGE_KERNEL_RO);
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- xen_write_cr3(__pa(swapper_pg_dir));
-
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ xen_write_cr3(__pa(initial_page_table));
memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
- return swapper_pg_dir;
+ return initial_page_table;
}
#endif /* CONFIG_X86_64 */
@@ -2290,7 +2343,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
+#ifdef CONFIG_X86_32
+ .write_cr3 = xen_write_cr3_init,
+#else
.write_cr3 = xen_write_cr3,
+#endif
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -2358,8 +2415,6 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
- vmap_lazy_unmap = false;
-
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
@@ -2627,7 +2682,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
+ (VM_PFNMAP | VM_RESERVED | VM_IO)));
rmd.mfn = mfn;
rmd.prot = prot;
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce..25c52f94a27 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
return 0;
}
-void __init xen_unplug_emulated_devices(void)
+void xen_unplug_emulated_devices(void)
{
int r;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 769c4b01fa3..b5a7f928234 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -23,7 +23,6 @@
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
-#include <xen/interface/memory.h>
#include <xen/features.h>
#include "xen-ops.h"
@@ -182,24 +181,21 @@ char * __init xen_memory_setup(void)
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end = map[i].addr + map[i].size;
- if (map[i].type == E820_RAM) {
- if (map[i].addr < mem_end && end > mem_end) {
- /* Truncate region to max_mem. */
- u64 delta = end - mem_end;
+ if (map[i].type == E820_RAM && end > mem_end) {
+ /* RAM off the end - may be partially included */
+ u64 delta = min(map[i].size, end - mem_end);
- map[i].size -= delta;
- extra_pages += PFN_DOWN(delta);
+ map[i].size -= delta;
+ end -= delta;
- end = mem_end;
- }
+ extra_pages += PFN_DOWN(delta);
}
- if (end > xen_extra_mem_start)
+ if (map[i].size > 0 && end > xen_extra_mem_start)
xen_extra_mem_start = end;
- /* If region is non-RAM or below mem_end, add what remains */
- if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
- map[i].size > 0)
+ /* Add region if any remains */
+ if (map[i].size > 0)
e820_add_region(map[i].addr, map[i].size, map[i].type);
}
@@ -248,26 +244,11 @@ char * __init xen_memory_setup(void)
else
extra_pages = 0;
- if (!xen_initial_domain())
- xen_add_extra_mem(extra_pages);
+ xen_add_extra_mem(extra_pages);
return "Xen";
}
-static void xen_idle(void)
-{
- local_irq_disable();
-
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
-}
-
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -337,9 +318,6 @@ void __cpuinit xen_enable_syscall(void)
void __init xen_arch_setup(void)
{
- struct physdev_set_iopl set_iopl;
- int rc;
-
xen_panic_handler_init();
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -356,11 +334,6 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- printk(KERN_INFO "physdev_op failed %d\n", rc);
-
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -372,7 +345,11 @@ void __init xen_arch_setup(void)
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
- pm_idle = xen_idle;
+ /* Set up idle, making sure it calls safe_halt() pvop */
+#ifdef CONFIG_X86_32
+ boot_cpu_data.hlt_works_ok = 1;
+#endif
+ pm_idle = default_idle;
fiddle_vdso();
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877..9bbd63a129b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
int cpu;
xen_hvm_init_shared_info();
xen_callback_vector();
+ xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
for_each_online_cpu(cpu) {
xen_setup_runstate_info(cpu);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b05..5da5e53fb94 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -426,6 +426,8 @@ void xen_timer_resume(void)
{
int cpu;
+ pvclock_resume();
+
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 64044747348..9d41bf98575 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -43,7 +43,7 @@ void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
-void __init xen_unplug_emulated_devices(void);
+void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);