aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
author: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2010-11-16 11:06:22 -0800
committer: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2010-11-16 11:06:22 -0800
commit: 20b4755e4fbb226eb42951bd40b53fcbce9ef944 (patch)
tree: 43da70e0b32ee423d3643ecd422821383411ab72 /arch/x86
parent: 744f9f104ea262de1dc3e29265870c649f0d9473 (diff)
parent: e53beacd23d9cb47590da6a7a7f6d417b941a994 (diff)
download: linux-linaro-android-20b4755e4fbb226eb42951bd40b53fcbce9ef944.tar.gz
Merge commit 'v2.6.37-rc2' into upstream/xenfs
* commit 'v2.6.37-rc2': (10093 commits)
  Linux 2.6.37-rc2
  capabilities/syslog: open code cap_syslog logic to fix build failure
  i2c: Sanity checks on adapter registration
  i2c: Mark i2c_adapter.id as deprecated
  i2c: Drivers shouldn't include <linux/i2c-id.h>
  i2c: Delete unused adapter IDs
  i2c: Remove obsolete cleanup for clientdata
  include/linux/kernel.h: Move logging bits to include/linux/printk.h
  Fix gcc 4.5.1 miscompiling drivers/char/i8k.c (again)
  hwmon: (w83795) Check for BEEP pin availability
  hwmon: (w83795) Clear intrusion alarm immediately
  hwmon: (w83795) Read the intrusion state properly
  hwmon: (w83795) Print the actual temperature channels as sources
  hwmon: (w83795) List all usable temperature sources
  hwmon: (w83795) Expose fan control method
  hwmon: (w83795) Fix fan control mode attributes
  hwmon: (lm95241) Check validity of input values
  hwmon: Change mail address of Hans J. Koch
  PCI: sysfs: fix printk warnings
  GFS2: Fix inode deallocation race
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig154
-rw-r--r--arch/x86/Kconfig.debug14
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/Makefile_32.cpu13
-rw-r--r--arch/x86/boot/compressed/misc.c29
-rw-r--r--arch/x86/include/asm/acpi.h3
-rw-r--r--arch/x86/include/asm/alternative.h11
-rw-r--r--arch/x86/include/asm/amd_iommu.h6
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h23
-rw-r--r--arch/x86/include/asm/amd_nb.h (renamed from arch/x86/include/asm/k8.h)21
-rw-r--r--arch/x86/include/asm/apb_timer.h1
-rw-r--r--arch/x86/include/asm/apic.h14
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/calgary.h4
-rw-r--r--arch/x86/include/asm/calling.h52
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/dwarf2.h20
-rw-r--r--arch/x86/include/asm/e820.h20
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h23
-rw-r--r--arch/x86/include/asm/fixmap.h15
-rw-r--r--arch/x86/include/asm/gart.h20
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/highmem.h11
-rw-r--r--arch/x86/include/asm/hpet.h10
-rw-r--r--arch/x86/include/asm/hw_irq.h19
-rw-r--r--arch/x86/include/asm/i387.h185
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/io.h15
-rw-r--r--arch/x86/include/asm/io_apic.h7
-rw-r--r--arch/x86/include/asm/iomap.h4
-rw-r--r--arch/x86/include/asm/iommu_table.h100
-rw-r--r--arch/x86/include/asm/irq.h12
-rw-r--r--arch/x86/include/asm/irq_remapping.h35
-rw-r--r--arch/x86/include/asm/irq_vectors.h4
-rw-r--r--arch/x86/include/asm/irqflags.h32
-rw-r--r--arch/x86/include/asm/jump_label.h37
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h105
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/module.h7
-rw-r--r--arch/x86/include/asm/mrst.h10
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/mwait.h15
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h4
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h21
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/pci.h33
-rw-r--r--arch/x86/include/asm/pci_x86.h1
-rw-r--r--arch/x86/include/asm/percpu.h14
-rw-r--r--arch/x86/include/asm/perf_event.h19
-rw-r--r--arch/x86/include/asm/perf_event_p4.h52
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_32.h16
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/processor.h29
-rw-r--r--arch/x86/include/asm/pvclock.h38
-rw-r--r--arch/x86/include/asm/segment.h32
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/swiotlb.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/trampoline.h3
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h21
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h189
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/x86_init.h9
-rw-r--r--arch/x86/include/asm/xen/page.h11
-rw-r--r--arch/x86/include/asm/xen/pci.h65
-rw-r--r--arch/x86/kernel/Makefile20
-rw-r--r--arch/x86/kernel/acpi/boot.c60
-rw-r--r--arch/x86/kernel/acpi/cstate.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.c15
-rw-r--r--arch/x86/kernel/alternative.c32
-rw-r--r--arch/x86/kernel/amd_iommu.c2
-rw-r--r--arch/x86/kernel/amd_iommu_init.c139
-rw-r--r--arch/x86/kernel/amd_nb.c (renamed from arch/x86/kernel/k8.c)56
-rw-r--r--arch/x86/kernel/apb_timer.c60
-rw-r--r--arch/x86/kernel/aperture_64.c31
-rw-r--r--arch/x86/kernel/apic/apic.c90
-rw-r--r--arch/x86/kernel/apic/io_apic.c891
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_64.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c36
-rw-r--r--arch/x86/kernel/apm_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/check.c16
-rw-r--r--arch/x86/kernel/cpu/amd.c77
-rw-r--r--arch/x86/kernel/cpu/common.c24
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c27
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c128
-rw-r--r--arch/x86/kernel/cpu/perf_event.c306
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c8
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c8
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c229
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c292
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c9
-rw-r--r--arch/x86/kernel/cpu/scattered.c6
-rw-r--r--arch/x86/kernel/crash_dump_32.c2
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c8
-rw-r--r--arch/x86/kernel/e820.c191
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/early_printk.c13
-rw-r--r--arch/x86/kernel/early_printk_mrst.c319
-rw-r--r--arch/x86/kernel/entry_32.S310
-rw-r--r--arch/x86/kernel/entry_64.S134
-rw-r--r--arch/x86/kernel/ftrace.c63
-rw-r--r--arch/x86/kernel/head.c3
-rw-r--r--arch/x86/kernel/head32.c11
-rw-r--r--arch/x86/kernel/head64.c9
-rw-r--r--arch/x86/kernel/head_32.S55
-rw-r--r--arch/x86/kernel/hpet.c69
-rw-r--r--arch/x86/kernel/i387.c58
-rw-r--r--arch/x86/kernel/i8259.c63
-rw-r--r--arch/x86/kernel/irq.c32
-rw-r--r--arch/x86/kernel/irq_32.c25
-rw-r--r--arch/x86/kernel/irq_work.c30
-rw-r--r--arch/x86/kernel/irqinit.c23
-rw-r--r--arch/x86/kernel/jump_label.c50
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c14
-rw-r--r--arch/x86/kernel/kprobes.c14
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c4
-rw-r--r--arch/x86/kernel/microcode_amd.c2
-rw-r--r--arch/x86/kernel/microcode_core.c3
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/module.c3
-rw-r--r--arch/x86/kernel/mpparse.c5
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c44
-rw-r--r--arch/x86/kernel/pci-gart_64.c33
-rw-r--r--arch/x86/kernel/pci-iommu_table.c89
-rw-r--r--arch/x86/kernel/pci-swiotlb.c44
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c17
-rw-r--r--arch/x86/kernel/pvclock.c41
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c14
-rw-r--r--arch/x86/kernel/setup.c235
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/smp.c15
-rw-r--r--arch/x86/kernel/smpboot.c137
-rw-r--r--arch/x86/kernel/sys_i386_32.c4
-rw-r--r--arch/x86/kernel/trampoline.c26
-rw-r--r--arch/x86/kernel/traps.c36
-rw-r--r--arch/x86/kernel/tsc.c66
-rw-r--r--arch/x86/kernel/vm86_32.c10
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
-rw-r--r--arch/x86/kernel/vmlinux.lds.S30
-rw-r--r--arch/x86/kernel/x86_init.c7
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/emulate.c2262
-rw-r--r--arch/x86/kvm/i8254.c11
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/lapic.c18
-rw-r--r--arch/x86/kvm/mmu.c927
-rw-r--r--arch/x86/kvm/mmu.h9
-rw-r--r--arch/x86/kvm/mmu_audit.c299
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h202
-rw-r--r--arch/x86/kvm/svm.c298
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/vmx.c243
-rw-r--r--arch/x86/kvm/x86.c803
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/lib/memcpy_32.c199
-rw-r--r--arch/x86/lib/memcpy_64.S158
-rw-r--r--arch/x86/lib/memmove_64.c189
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c110
-rw-r--r--arch/x86/mm/highmem_32.c76
-rw-r--r--arch/x86/mm/init.c10
-rw-r--r--arch/x86/mm/init_32.c166
-rw-r--r--arch/x86/mm/init_64.c117
-rw-r--r--arch/x86/mm/iomap_32.c43
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/k8topology_64.c12
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c7
-rw-r--r--arch/x86/mm/numa_32.c30
-rw-r--r--arch/x86/mm/numa_64.c91
-rw-r--r--arch/x86/mm/pgtable.c24
-rw-r--r--arch/x86/mm/srat_32.c3
-rw-r--r--arch/x86/mm/srat_64.c11
-rw-r--r--arch/x86/mm/tlb.c48
-rw-r--r--arch/x86/oprofile/backtrace.c70
-rw-r--r--arch/x86/oprofile/nmi_int.c15
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/pci/Makefile1
-rw-r--r--arch/x86/pci/acpi.c103
-rw-r--r--arch/x86/pci/common.c17
-rw-r--r--arch/x86/pci/i386.c19
-rw-r--r--arch/x86/pci/irq.c11
-rw-r--r--arch/x86/pci/mmconfig-shared.c4
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/xen.c416
-rw-r--r--arch/x86/platform/Makefile8
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c (renamed from arch/x86/kernel/efi.c)5
-rw-r--r--arch/x86/platform/efi/efi_32.c (renamed from arch/x86/kernel/efi_32.c)0
-rw-r--r--arch/x86/platform/efi/efi_64.c (renamed from arch/x86/kernel/efi_64.c)0
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S (renamed from arch/x86/kernel/efi_stub_32.S)0
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S (renamed from arch/x86/kernel/efi_stub_64.S)0
-rw-r--r--arch/x86/platform/mrst/Makefile1
-rw-r--r--arch/x86/platform/mrst/mrst.c (renamed from arch/x86/kernel/mrst.c)0
-rw-r--r--arch/x86/platform/olpc/Makefile3
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c140
-rw-r--r--arch/x86/platform/olpc/olpc.c (renamed from arch/x86/kernel/olpc.c)89
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c (renamed from arch/x86/kernel/olpc_ofw.c)6
-rw-r--r--arch/x86/platform/scx200/Makefile2
-rw-r--r--arch/x86/platform/scx200/scx200_32.c (renamed from arch/x86/kernel/scx200_32.c)0
-rw-r--r--arch/x86/platform/sfi/Makefile1
-rw-r--r--arch/x86/platform/sfi/sfi.c (renamed from arch/x86/kernel/sfi.c)4
-rw-r--r--arch/x86/platform/uv/Makefile1
-rw-r--r--arch/x86/platform/uv/bios_uv.c (renamed from arch/x86/kernel/bios_uv.c)0
-rw-r--r--arch/x86/platform/uv/tlb_uv.c (renamed from arch/x86/kernel/tlb_uv.c)25
-rw-r--r--arch/x86/platform/uv/uv_irq.c (renamed from arch/x86/kernel/uv_irq.c)55
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c (renamed from arch/x86/kernel/uv_sysfs.c)0
-rw-r--r--arch/x86/platform/uv/uv_time.c (renamed from arch/x86/kernel/uv_time.c)0
-rw-r--r--arch/x86/platform/visws/Makefile1
-rw-r--r--arch/x86/platform/visws/visws_quirks.c (renamed from arch/x86/kernel/visws_quirks.c)140
-rw-r--r--arch/x86/xen/Kconfig21
-rw-r--r--arch/x86/xen/debugfs.c1
-rw-r--r--arch/x86/xen/enlighten.c27
-rw-r--r--arch/x86/xen/mmu.c504
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c9
-rw-r--r--arch/x86/xen/setup.c134
-rw-r--r--arch/x86/xen/smp.c32
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/xen-ops.h3
264 files changed, 10153 insertions, 7245 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb3..0e103236b75 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -14,3 +14,4 @@ obj-y += crypto/
obj-y += vdso/
obj-$(CONFIG_IA32_EMULATION) += ia32/
+obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cea0cd9a316..e8327686d3c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,6 +1,3 @@
-# x86 configuration
-mainmenu "Linux Kernel Configuration for x86"
-
# Select 32 or 64 bit
config 64BIT
bool "64-bit kernel" if ARCH = "x86"
@@ -25,14 +22,17 @@ config X86
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PERF_EVENTS if (!M386 && !M486)
+ select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
+ select HAVE_MEMBLOCK
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_C_RECORDMCOUNT
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
@@ -59,6 +59,12 @@ config X86
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_TEXT_POKE_SMP
+ select HAVE_GENERIC_HARDIRQS
+ select HAVE_SPARSE_IRQ
+ select GENERIC_IRQ_PROBE
+ select GENERIC_PENDING_IRQ if SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -193,27 +199,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
-config HAVE_EARLY_RES
- def_bool y
-
config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
- def_bool y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
-config GENERIC_IRQ_PROBE
- def_bool y
-
-config GENERIC_PENDING_IRQ
- def_bool y
- depends on GENERIC_HARDIRQS && SMP
-
config USE_GENERIC_SMP_HELPERS
def_bool y
depends on SMP
@@ -296,23 +285,6 @@ config X86_X2APIC
If you don't know what to do here, say N.
-config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on PCI_MSI || HT_IRQ
- ---help---
- This enables support for sparse irqs. This is useful for distro
- kernels that want to define a high CONFIG_NR_CPUS value but still
- want to have low kernel memory footprint on smaller machines.
-
- ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
- out the irq_desc[] array in a more NUMA-friendly way. )
-
- If you don't know what to do here, say N.
-
-config NUMA_IRQ_DESC
- def_bool y
- depends on SPARSE_IRQ && NUMA
-
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
@@ -372,6 +344,7 @@ endif
config X86_VSMP
bool "ScaleMP vSMP"
+ select PARAVIRT_GUEST
select PARAVIRT
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
@@ -517,25 +490,6 @@ if PARAVIRT_GUEST
source "arch/x86/xen/Kconfig"
-config VMI
- bool "VMI Guest support (DEPRECATED)"
- select PARAVIRT
- depends on X86_32
- ---help---
- VMI provides a paravirtualized interface to the VMware ESX server
- (it could be used by other hypervisors in theory too, but is not
- at the moment), by linking the kernel to a GPL-ed ROM module
- provided by the hypervisor.
-
- As of September 2009, VMware has started a phased retirement
- of this feature from VMware's products. Please see
- feature-removal-schedule.txt for details. If you are
- planning to enable this option, please note that you cannot
- live migrate a VMI enabled VM to a future VMware product,
- which doesn't support VMI. So if you expect your kernel to
- seamlessly migrate to newer VMware products, keep this
- disabled.
-
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
@@ -590,16 +544,7 @@ config PARAVIRT_DEBUG
a paravirt_op is missing when it is called.
config NO_BOOTMEM
- default y
- bool "Disable Bootmem code"
- ---help---
- Use early_res directly instead of bootmem before slab is ready.
- - allocator (buddy) [generic]
- - early allocator (bootmem) [generic]
- - very early allocator (reserve_early*()) [x86]
- - very very early allocator (early brk model) [x86]
- So reduce one layer between early allocator to final allocator
-
+ def_bool y
config MEMTEST
bool "Memtest"
@@ -670,7 +615,7 @@ config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- depends on X86_64 && PCI && K8_NB
+ depends on X86_64 && PCI && AMD_NB
---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
@@ -795,6 +740,17 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config IRQ_TIME_ACCOUNTING
+ bool "Fine granularity task level IRQ time accounting"
+ default n
+ ---help---
+ Select this option to enable fine granularity task irq time
+ accounting. This is done by reading a timestamp on each
+ transitions between softirq and hardirq state, so there can be a
+ small performance impact.
+
+ If in doubt, say N here.
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
@@ -1148,6 +1104,9 @@ config X86_PAE
config ARCH_PHYS_ADDR_T_64BIT
def_bool X86_64 || X86_PAE
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool X86_64 || HIGHMEM64G
+
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
default y
@@ -1326,25 +1285,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
- default y
+config X86_RESERVE_LOW
+ int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+ default 64
+ range 4 640
---help---
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
+ Specify the amount of low memory to reserve for the BIOS.
+
+ The first page contains BIOS data structures that the kernel
+ must not use, so that page must always be reserved.
+
+ By default we reserve the first 64K of physical RAM, as a
+ number of BIOSes are known to corrupt that memory range
+ during events such as suspend/resume or monitor cable
+ insertion, so it must not be used by the kernel.
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
+ You can set this to 4 if you are absolutely sure that you
+ trust the BIOS to get all its memory reservations and usages
+ right. If you know your BIOS have problems beyond the
+ default 64K area, you can set this to 640 to avoid using the
+ entire low memory range.
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
+ If you have doubts about the BIOS (e.g. suspend/resume does
+ not work or there's kernel crashes after certain hardware
+ hotplug events) then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+ typical corruption patterns.
- Say Y if unsure.
+ Leave this to the default value of 64 if you are unsure.
config MATH_EMULATION
bool
@@ -1900,7 +1868,7 @@ config PCI_GODIRECT
bool "Direct"
config PCI_GOOLPC
- bool "OLPC"
+ bool "OLPC XO-1"
depends on OLPC
config PCI_GOANY
@@ -1925,6 +1893,11 @@ config PCI_OLPC
def_bool y
depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+config PCI_XEN
+ def_bool y
+ depends on PCI && XEN
+ select SWIOTLB_XEN
+
config PCI_DOMAINS
def_bool y
depends on PCI
@@ -2061,14 +2034,21 @@ config SCx200HR_TIMER
config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
+ select OLPC_OPENFIRMWARE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
+config OLPC_XO1
+ tristate "OLPC XO-1 support"
+ depends on OLPC && PCI
+ ---help---
+ Add support for non-essential features of the OLPC XO-1 laptop.
+
config OLPC_OPENFIRMWARE
bool "Support for OLPC's Open Firmware"
depends on !X86_64 && !X86_PAE
- default y if OLPC
+ default n
help
This option adds support for the implementation of Open Firmware
that is used on the OLPC XO-1 Children's Machine.
@@ -2076,7 +2056,7 @@ config OLPC_OPENFIRMWARE
endif # X86_32
-config K8_NB
+config AMD_NB
def_bool y
depends on CPU_SUP_AMD && PCI
@@ -2125,6 +2105,10 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
+config HAVE_TEXT_POKE_SMP
+ bool
+ select STOP_MACHINE if SMP
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63..b59ee765414 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally N here,
unless you want to debug such a crash.
+config EARLY_PRINTK_MRST
+ bool "Early printk for MRST platform support"
+ depends on EARLY_PRINTK && X86_MRST
+
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
@@ -121,16 +125,6 @@ config DEBUG_NX_TEST
and the software setup of this feature.
If in doubt, say "N"
-config 4KSTACKS
- bool "Use 4Kb for kernel stacks instead of 8Kb"
- depends on X86_32
- ---help---
- If you say Y here the kernel will use a 4Kb stacksize for the
- kernel stack attached to each process/thread. This facilitates
- running more threads on a system and also reduces the pressure
- on the VM subsystem for higher order allocations. This option
- will also use IRQ stacks to compensate for the reduced stackspace.
-
config DOUBLEFAULT
default y
bool "Enable doublefault exception handler" if EMBEDDED
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b..b02e509072a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 1255d953c65..f2ee1abb1df 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -51,7 +51,18 @@ cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
# tracer assumptions. For i686, generic, core2 this is set by the
# compiler anyway
-cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
+ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+# Work around to a bug with asm goto with first implementations of it
+# in gcc causing gcc to mess up the push and pop of the stack in some
+# uses of asm goto.
+ifeq ($(CONFIG_JUMP_LABEL), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args)
# Bug fix for binutils: this option is required in order to keep
# binutils from generating NOPL instructions against our will.
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8f7bef8e9ff..23f315c9f21 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -229,18 +229,35 @@ void *memset(void *s, int c, size_t n)
ss[i] = c;
return s;
}
-
+#ifdef CONFIG_X86_32
void *memcpy(void *dest, const void *src, size_t n)
{
- int i;
- const char *s = src;
- char *d = dest;
+ int d0, d1, d2;
+ asm volatile(
+ "rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
+ : "memory");
- for (i = 0; i < n; i++)
- d[i] = s[i];
return dest;
}
+#else
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep ; movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
+ return dest;
+}
+#endif
static void error(char *x)
{
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 92091de1111..55d106b5e31 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -93,6 +93,9 @@ extern u8 acpi_sci_flags;
extern int acpi_sci_override_gsi;
void acpi_pic_sci_set_trigger(unsigned int, u16);
+extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity);
+
static inline void disable_acpi(void)
{
acpi_disabled = 1;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bc6abb7bc7e..76561d20ea2 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/stringify.h>
+#include <linux/jump_label.h>
#include <asm/asm.h>
/*
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
/*
* Clear and restore the kernel write-protection flag on the local CPU.
* Allows the kernel to edit read-only pages.
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+#define IDEAL_NOP_SIZE_5 5
+extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+extern void arch_init_ideal_nop5(void);
+#else
+static inline void arch_init_ideal_nop5(void) {}
+#endif
+
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b..a6863a2dec1 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -24,11 +24,11 @@
#ifdef CONFIG_AMD_IOMMU
-extern void amd_iommu_detect(void);
+extern int amd_iommu_detect(void);
#else
-static inline void amd_iommu_detect(void) { }
+static inline int amd_iommu_detect(void) { return -ENODEV; }
#endif
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90..916bc8111a0 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
*
* This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180dea..e3509fc303b 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -416,13 +416,22 @@ struct amd_iommu {
struct dma_ops_domain *default_dom;
/*
- * This array is required to work around a potential BIOS bug.
- * The BIOS may miss to restore parts of the PCI configuration
- * space when the system resumes from S3. The result is that the
- * IOMMU does not execute commands anymore which leads to system
- * failure.
+ * We can't rely on the BIOS to restore all values on reinit, so we
+ * need to stash them
*/
- u32 cache_cfg[4];
+
+ /* The iommu BAR */
+ u32 stored_addr_lo;
+ u32 stored_addr_hi;
+
+ /*
+ * Each iommu has 6 l1s, each of which is documented as having 0x12
+ * registers
+ */
+ u32 stored_l1[6][0x12];
+
+ /* The l2 indirect registers */
+ u32 stored_l2[0x83];
};
/*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
index af00bd1d208..c8517f81b21 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
#include <linux/pci.h>
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
struct bootnode;
extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_get_nodes(struct bootnode *nodes);
extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int k8_scan_nodes(void);
-#ifdef CONFIG_K8_NB
-extern int num_k8_northbridges;
+struct k8_northbridge_info {
+ u16 num;
+ u8 gart_supported;
+ struct pci_dev **nb_misc;
+};
+extern struct k8_northbridge_info k8_northbridges;
+
+#ifdef CONFIG_AMD_NB
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
- return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+ return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
}
#else
-#define num_k8_northbridges 0
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
#endif
-#endif /* _ASM_X86_K8_H */
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index a69b1ac9eaf..2fefa501d3b 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
-extern unsigned int boot_cpu_id;
extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae4..f6ce0bda3b9 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
static inline u32 native_apic_msr_read(u32 reg)
{
- u32 low, high;
+ u64 msr;
if (reg == APIC_DFR)
return -1;
- rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
- return low;
+ rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
+ return (u32)msr;
}
static inline void native_x2apic_wait_icr_idle(void)
@@ -181,12 +181,12 @@ extern void enable_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
static inline int x2apic_enabled(void)
{
- int msr, msr2;
+ u64 msr;
if (!cpu_has_x2apic)
return 0;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ rdmsrl(MSR_IA32_APICBASE, msr);
if (msr & X2APIC_ENABLE)
return 1;
return 0;
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
}
#endif
-extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
-extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
-
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
#else /* !CONFIG_X86_LOCAL_APIC */
static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f0..a859ca461fb 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
#define APIC_EILVT_MSG_FIX 0x0
#define APIC_EILVT_MSG_SMI 0x2
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index bafd80defa4..903683b07e4 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -440,6 +440,8 @@ static inline int fls(int x)
#ifdef __KERNEL__
+#include <asm-generic/bitops/find.h>
+
#include <asm-generic/bitops/sched.h>
#define ARCH_HAS_FAST_MULTIPLIER 1
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0918654305a..0d467b33883 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,9 +62,9 @@ struct cal_chipset_ops {
extern int use_calgary;
#ifdef CONFIG_CALGARY_IOMMU
-extern void detect_calgary(void);
+extern int detect_calgary(void);
#else
-static inline void detect_calgary(void) { return; }
+static inline int detect_calgary(void) { return -ENODEV; }
#endif
#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0e63c9a2a8d..30af5a83216 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with
/*
- * 64-bit system call stack frame layout defines and helpers,
- * for assembly code:
+ * 64-bit system call stack frame layout defines and helpers, for
+ * assembly code (note that the seemingly unnecessary parentheses
+ * are to prevent cpp from inserting spaces in expressions that get
+ * passed to macros):
*/
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
+#define R15 (0)
+#define R14 (8)
+#define R13 (16)
+#define R12 (24)
+#define RBP (32)
+#define RBX (40)
/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
+#define R11 (48)
+#define R10 (56)
+#define R9 (64)
+#define R8 (72)
+#define RAX (80)
+#define RCX (88)
+#define RDX (96)
+#define RSI (104)
+#define RDI (112)
+#define ORIG_RAX (120) /* + error_code */
/* end of arguments */
/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
+#define RIP (128)
+#define CS (136)
+#define EFLAGS (144)
+#define RSP (152)
+#define SS (160)
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
@@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#define ARG_SKIP 9*8
+#define ARG_SKIP (9*8)
.macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
skipr8910=0, skiprdx=0
@@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#define REST_SKIP 6*8
+#define REST_SKIP (6*8)
.macro SAVE_REST
subq $REST_SKIP, %rsp
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19..4fab24de26b 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
DECLARE_PER_CPU(int, cpu_state);
-extern unsigned int boot_cpu_id;
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589a..220e2ea08e8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
+#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a..32609919931 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
CFI_ADJUST_CFA_OFFSET -8
.endm
+ .macro pushfq_cfi
+ pushfq
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popfq_cfi
+ popfq
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
.macro movq_cfi reg offset=0
movq %\reg, \offset(%rsp)
CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
CFI_ADJUST_CFA_OFFSET -4
.endm
+ .macro pushfl_cfi
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ .endm
+
+ .macro popfl_cfi
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+ .endm
+
.macro movl_cfi reg offset=0
movl %\reg, \offset(%esp)
CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index ec8a52d14ab..5be1542fbfa 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -112,23 +112,13 @@ static inline void early_memtest(unsigned long start, unsigned long end)
}
#endif
-extern unsigned long end_user_pfn;
-
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-#include <linux/early_res.h>
-
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
-extern int e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn);
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
-extern u64 e820_hole_size(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+void memblock_x86_fill(void);
+void memblock_find_dma_reserve(void);
+
extern void finish_e820_parsing(void);
extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8406ed7f992..8e4a16508d4 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
-extern void efi_reserve_early(void);
+extern void efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98..57650ab4a5f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
-BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
+.irpc idx, "01234567"
+BUILD_INTERRUPT3(invalidate_interrupt\idx,
+ (INVALIDATE_TLB_VECTOR_START)+\idx,
smp_invalidate_interrupt)
+.endr
#endif
BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
@@ -49,8 +38,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1d..4d293dced62 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+
+/* Map phys at fixmap slot idx; return the slot's virtual address plus phys's offset within its page */
+static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ __set_fixmap(idx, phys, flags);
+ return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc..43085bfc99c 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
#define GARTEN (1<<0)
#define DISGARTCPU (1<<4)
#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
/* GART cache control register bits. */
#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
#define AMD64_GARTAPERTUREBASE 0x94
#define AMD64_GARTTABLEBASE 0x98
#define AMD64_GARTCACHECTL 0x9c
-#define AMD64_GARTEN (1<<0)
#ifdef CONFIG_GART_IOMMU
extern int gart_iommu_aperture;
@@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
-extern void gart_iommu_hole_init(void);
+extern int gart_iommu_hole_init(void);
#else
#define gart_iommu_aperture 0
@@ -50,13 +50,27 @@ static inline void early_gart_iommu_check(void)
static inline void gart_parse_options(char *options)
{
}
-static inline void gart_iommu_hole_init(void)
+static inline int gart_iommu_hole_init(void)
{
+ return -ENODEV;
}
#endif
extern int agp_amd64_init(void);
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+ * Also, set DISTLBWALKPRB since GART tables memory is UC.
+ */
+ ctl = DISTLBWALKPRB | order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee61..55e4de613f0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
- unsigned int apic_pending_irqs;
+ unsigned int apic_irq_work_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 8caac76ac32..3bd04022fd0 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page);
void *kmap(struct page *page);
void kunmap(struct page *page);
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
-void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *__kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdf..2c392d663dc 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,10 +74,12 @@ extern void hpet_disable(void);
extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
-extern void hpet_msi_unmask(unsigned int irq);
-extern void hpet_msi_mask(unsigned int irq);
-extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
-extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+struct irq_data;
+extern void hpet_msi_unmask(struct irq_data *data);
+extern void hpet_msi_mask(struct irq_data *data);
+struct hpet_dev;
+extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
#ifdef CONFIG_PCI_MSI
extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 46c0fe05f23..0274ec5a7e6 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
extern void apic_timer_interrupt(void);
extern void x86_platform_ipi(void);
extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
@@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
irq_attr->polarity = polarity;
}
+struct irq_2_iommu {
+ struct intel_iommu *iommu;
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+};
+
/*
* This is performance-critical, we want to do it O(1)
*
@@ -89,15 +96,17 @@ struct irq_cfg {
cpumask_var_t old_domain;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_INTR_REMAP
+ struct irq_2_iommu irq_2_iommu;
+#endif
};
-extern struct irq_cfg *irq_cfg(unsigned int);
extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
extern void send_cleanup_vector(struct irq_cfg *);
-struct irq_desc;
-extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
- unsigned int *dest_id);
+struct irq_data;
+int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
+ unsigned int *dest_id);
extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e6..4aa2bb3b242 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
extern int restore_i387_xstate_ia32(void __user *buf);
#endif
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
#define X87_FSW_ES (1 << 7) /* Exception Summary */
static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
return static_cpu_has(X86_FEATURE_XSAVE);
}
+static __always_inline __pure bool use_fxsr(void)
+{
+ return static_cpu_has(X86_FEATURE_FXSR);
+}
+
extern void __sanitize_i387_state(struct task_struct *);
static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
}
#ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
-}
-
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
int err;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "m" (*fx), "0" (0));
return err;
}
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. The kernel data segment can be sometimes 0 and sometimes
- new user value. Both should be ok.
- Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
-{
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- /*
- * xsave header may indicate the init state of the FP.
- */
- if (use_xsave() &&
- !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- return;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
- alternative_input(ASM_NOP8 ASM_NOP2,
- " emms\n" /* clear stack tags */
- " fildl %%gs:0", /* load to clear state */
- X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
- fpu_clear(&tsk->thread.fpu);
-}
-
static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
{
int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
if (unlikely(err))
return -EFAULT;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "0" (0));
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
uses any extended registers for addressing, a second REX prefix
will be generated (to the assembler, rex64 followed by semicolon
is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
+
+#ifdef CONFIG_AS_FXSAVEQ
/* Using "fxsaveq %0" would be the ideal choice, but is only supported
starting with gas 2.16. */
__asm__ __volatile__("fxsaveq %0"
: "=m" (fpu->state->fxsave));
-#elif 0
+#else
/* Using, as a workaround, the properly prefixed form below isn't
accepted by any binutils version so far released, complaining that
the same type of prefix is used twice if an extended register is
- needed for addressing (fix submitted to mainline 2005-11-21). */
- __asm__ __volatile__("rex64/fxsave %0"
- : "=m" (fpu->state->fxsave));
-#else
- /* This, however, we can work around by forcing the compiler to select
+ needed for addressing (fix submitted to mainline 2005-11-21).
+ asm volatile("rex64/fxsave %0"
+ : "=m" (fpu->state->fxsave));
+ This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (fpu->state->fxsave)
- : "cdaSDb" (&fpu->state->fxsave));
+ asm volatile("rex64/fxsave (%[fx])"
+ : "=m" (fpu->state->fxsave)
+ : [fx] "R" (&fpu->state->fxsave));
#endif
}
-static inline void fpu_save_init(struct fpu *fpu)
-{
- if (use_xsave())
- fpu_xsave(fpu);
- else
- fpu_fxsave(fpu);
-
- fpu_clear(fpu);
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- fpu_save_init(&tsk->thread.fpu);
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
#else /* CONFIG_X86_32 */
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-static inline void tolerant_fwait(void)
-{
- asm volatile("fnclex ; fwait");
-}
-
/* perform fxrstor iff the processor has extended states, otherwise frstor */
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return 0;
}
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+ asm volatile("fxsave %[fx]"
+ : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif /* CONFIG_X86_64 */
+
/* We need a safe address that is cheap to find and that is already
in L1 during context switch. The best choices are unfortunately
different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
static inline void fpu_save_init(struct fpu *fpu)
{
if (use_xsave()) {
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
fpu_xsave(fpu);
/*
* xsave header may indicate the init state of the FP.
*/
- if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- goto end;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
-
- /*
- * we can do a simple return here or be paranoid :)
- */
- goto clear_state;
+ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+ return;
+ } else if (use_fxsr()) {
+ fpu_fxsave(fpu);
+ } else {
+ asm volatile("fsave %[fx]; fwait"
+ : [fx] "=m" (fpu->state->fsave));
+ return;
}
- /* Use more nops than strictly needed in case the compiler
- varies code */
- alternative_input(
- "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
- "fxsave %[fx]\n"
- "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
- X86_FEATURE_FXSR,
- [fx] "m" (fpu->state->fxsave),
- [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
+ if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+ asm volatile("fnclex");
+
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. safe_address is a random variable that should be in L1 */
alternative_input(
- GENERIC_NOP8 GENERIC_NOP2,
+ ASM_NOP8 ASM_NOP2,
"emms\n\t" /* clear stack tags */
- "fildl %[addr]", /* set F?P to defined value */
+ "fildl %P[addr]", /* set F?P to defined value */
X86_FEATURE_FXSAVE_LEAK,
[addr] "m" (safe_address));
-end:
- ;
}
static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
}
-
-#endif /* CONFIG_X86_64 */
-
static inline int fpu_fxrstor_checking(struct fpu *fpu)
{
return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
static inline void __clear_fpu(struct task_struct *tsk)
{
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- tolerant_fwait();
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
}
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
stts();
}
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- __save_init_fpu(tsk);
- stts();
-}
-
-#define unlazy_fpu __unlazy_fpu
-#define clear_fpu __clear_fpu
-
-#else /* CONFIG_X86_32 */
-
/*
* These disable preemption on their own and are safe
*/
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
preempt_enable();
}
-#endif /* CONFIG_X86_64 */
-
/*
* i387 state interaction
*/
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
#endif /* __ASSEMBLY__ */
-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
-
#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1655147646a..a20365953bf 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
struct legacy_pic {
int nr_legacy_irqs;
struct irq_chip *chip;
+ void (*mask)(unsigned int irq);
+ void (*unmask)(unsigned int irq);
void (*mask_all)(void);
void (*restore_mask)(void);
void (*init)(int auto_eoi);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e977612..07227308252 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -41,6 +41,8 @@
#include <asm-generic/int-ll64.h>
#include <asm/page.h>
+#include <xen/xen.h>
+
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -206,6 +208,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
extern void iounmap(volatile void __iomem *addr);
+extern void set_iounmap_nonlazy(void);
#ifdef __KERNEL__
@@ -348,6 +351,18 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
extern void fixup_early_ioremap(void);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#ifdef CONFIG_XEN
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
+ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif /* CONFIG_XEN */
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9cb2edb87c2..a6b28d017c2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -169,13 +169,8 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void probe_nr_irqs_gsi(void);
+extern int get_nr_irqs_gsi(void);
-extern int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin);
-extern void ioapic_write_entry(int apic, int pin,
- struct IO_APIC_route_entry e);
extern void setup_ioapic_ids_from_mpc(void);
struct mp_ioapic_gsi{
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c4191b3b705..363e33eb6ec 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -27,10 +27,10 @@
#include <asm/tlbflush.h>
void __iomem *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
void
-iounmap_atomic(void __iomem *kvaddr, enum km_type type);
+iounmap_atomic(void __iomem *kvaddr);
int
iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
new file mode 100644
index 00000000000..f229b13a5f3
--- /dev/null
+++ b/arch/x86/include/asm/iommu_table.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_IOMMU_TABLE_H
+#define _ASM_X86_IOMMU_TABLE_H
+
+#include <asm/swiotlb.h>
+
+/*
+ * History lesson:
+ * The execution chain of IOMMUs in 2.6.36 looks as so:
+ *
+ * [xen-swiotlb]
+ * |
+ * +----[swiotlb *]--+
+ * / | \
+ * / | \
+ * [GART] [Calgary] [Intel VT-d]
+ * /
+ * /
+ * [AMD-Vi]
+ *
+ * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
+ * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
+ * Also it would surreptitiously set swiotlb=1 if there were
+ * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
+ * flag would be turned off by all IOMMUs except the Calgary one.
+ *
+ * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
+ * to be built by defining who we depend on.
+ *
+ * And all that needs to be done is to use one of the macros in the IOMMU
+ * and the pci-dma.c will take care of the rest.
+ */
+
+struct iommu_table_entry {
+ initcall_t detect; /* Detection routine for this IOMMU. */
+ initcall_t depend; /* Detection routine that must be called before ours. */
+ void (*early_init)(void); /* No memory allocator available yet. */
+ void (*late_init)(void); /* Memory allocator is available. */
+#define IOMMU_FINISH_IF_DETECTED (1<<0)
+#define IOMMU_DETECTED (1<<1)
+ int flags;
+};
+/*
+ * Macro fills out an entry in the .iommu_table that is equivalent
+ * to the fields that 'struct iommu_table_entry' has. The entries
+ * that are put in the .iommu_table section are not put in any order
+ * hence during boot-time we will have to re-sort them based on
+ * dependency. */
+
+
+#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
+ static const struct iommu_table_entry \
+ __iommu_entry_##_detect __used \
+ __attribute__ ((unused, __section__(".iommu_table"), \
+ aligned((sizeof(void *))))) \
+ = {_detect, _depend, _early_init, _late_init, \
+ (_finish) ? IOMMU_FINISH_IF_DETECTED : 0}
+/*
+ * The simplest IOMMU definition. Provide the detection routine
+ * and it will be run after the SWIOTLB and the other IOMMUs
+ * that utilize this macro. If the IOMMU is detected (ie, the
+ * detect routine returns a positive value), the other IOMMUs
+ * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
+ * to stop detecting the other IOMMUs after yours has been detected.
+ */
+#define IOMMU_INIT_POST(_detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0)
+
+#define IOMMU_INIT_POST_FINISH(_detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1)
+
+/*
+ * A more sophisticated version of IOMMU_INIT. This variant requires:
+ * a). A detection routine function.
+ * b). The name of the detection routine we depend on to get called
+ * before us.
+ * c). The init routine which gets called if the detection routine
+ * returns a positive value from the pci_iommu_alloc. This means
+ * no presence of a memory allocator.
+ * d). Similar to the 'init', except that this gets called from pci_iommu_init
+ * where we do have a memory allocator.
+ *
+ * The standard vs the _FINISH differs in that the standard variant will
+ * continue detecting other IOMMUs in the call list after the
+ * detection routine returns a positive number. The _FINISH will
+ * stop the execution chain. Both will still call the 'init' and
+ * 'late_init' functions if they are set.
+ */
+#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
+
+#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
+
+void sort_iommu_table(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+void check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+#endif /* _ASM_X86_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 5458380b6ef..13b0ebaa512 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -19,18 +19,14 @@ static inline int irq_canonicalize(int irq)
# define ARCH_HAS_NMI_WATCHDOG
#endif
-#ifdef CONFIG_4KSTACKS
- extern void irq_ctx_init(int cpu);
- extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
+#ifdef CONFIG_X86_32
+extern void irq_ctx_init(int cpu);
#else
# define irq_ctx_init(cpu) do { } while (0)
-# define irq_ctx_exit(cpu) do { } while (0)
-# ifdef CONFIG_X86_64
-# define __ARCH_HAS_DO_SOFTIRQ
-# endif
#endif
+#define __ARCH_HAS_DO_SOFTIRQ
+
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
extern void fixup_irqs(void);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e224450..1c23360fb2d 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
+#ifdef CONFIG_INTR_REMAP
+static inline void prepare_irte(struct irte *irte, int vector,
+ unsigned int dest)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments (in io_apic.c) explaining IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte->trigger_mode = 0;
+ irte->dlvry_mode = apic->irq_delivery_mode;
+ irte->vector = vector;
+ irte->dest_id = IRTE_DEST(dest);
+ irte->redir_hint = 1;
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{ /* !CONFIG_INTR_REMAP: empty stub, nothing is written to *irte */
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e2ca3009255..6af0894dafb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
#define X86_PLATFORM_IPI_VECTOR 0xed
/*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
*/
-#define LOCAL_PENDING_VECTOR 0xec
+#define IRQ_WORK_VECTOR 0xec
#define UV_BAU_MESSAGE 0xea
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9e2b952f810..5745ce8bf10 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -61,22 +61,22 @@ static inline void native_halt(void)
#else
#ifndef __ASSEMBLY__
-static inline unsigned long __raw_local_save_flags(void)
+static inline unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline void raw_local_irq_restore(unsigned long flags)
+static inline void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline void raw_local_irq_disable(void)
+static inline void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline void raw_local_irq_enable(void)
+static inline void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void raw_safe_halt(void)
+static inline void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -102,12 +102,10 @@ static inline void halt(void)
/*
* For spinlocks, etc:
*/
-static inline unsigned long __raw_local_irq_save(void)
+static inline unsigned long arch_local_irq_save(void)
{
- unsigned long flags = __raw_local_save_flags();
-
- raw_local_irq_disable();
-
+ unsigned long flags = arch_local_save_flags();
+ arch_local_irq_disable();
return flags;
}
#else
@@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void)
#endif /* CONFIG_PARAVIRT */
#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_save(flags) \
- do { (flags) = __raw_local_irq_save(); } while (0)
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
+static inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
}
-static inline int raw_irqs_disabled(void)
+static inline int arch_irqs_disabled(void)
{
- unsigned long flags = __raw_local_save_flags();
+ unsigned long flags = arch_local_save_flags();
- return raw_irqs_disabled_flags(flags);
+ return arch_irqs_disabled_flags(flags);
}
#else
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 00000000000..f52d42e8058
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/nops.h>
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+# define JUMP_LABEL(key, label) \
+ do { \
+ asm goto("1:" \
+ JUMP_LABEL_INITIAL_NOP \
+ ".pushsection __jump_table, \"a\" \n\t"\
+ _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
+ ".popsection \n\t" \
+ : : "i" (key) : : label); \
+ } while (0)
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+ jump_label_t code;
+ jump_label_t target;
+ jump_label_t key;
+};
+
+#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecfc48e..b36c6b3fe14 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,7 @@ struct x86_emulate_ops {
void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+ void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
int (*cpl)(struct kvm_vcpu *vcpu);
@@ -156,7 +157,10 @@ struct operand {
unsigned long orig_val;
u64 orig_val64;
};
- unsigned long *ptr;
+ union {
+ unsigned long *reg;
+ unsigned long mem;
+ } addr;
union {
unsigned long val;
u64 val64;
@@ -190,6 +194,7 @@ struct decode_cache {
bool has_seg_override;
u8 seg_override;
unsigned int d;
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
unsigned long regs[NR_VCPU_REGS];
unsigned long eip;
/* modrm */
@@ -197,17 +202,16 @@ struct decode_cache {
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
- u8 use_modrm_ea;
+ u8 modrm_seg;
bool rip_relative;
- unsigned long modrm_ea;
- void *modrm_ptr;
- unsigned long modrm_val;
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
};
struct x86_emulate_ctxt {
+ struct x86_emulate_ops *ops;
+
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
@@ -220,12 +224,11 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
- bool restart; /* restart string instruction after writeback */
+ bool perm_ok; /* do not check permissions if true */
int exception; /* exception that happens during emulation or -1 */
u32 error_code; /* error code for exception */
bool error_code_valid;
- unsigned long cr2; /* faulted address in case of #PF */
/* decode cache */
struct decode_cache decode;
@@ -249,13 +252,14 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 502e53f999c..9e6fe391094 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,10 +236,14 @@ struct kvm_pio_request {
*/
struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+ unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
u32 *error);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -249,13 +253,18 @@ struct kvm_mmu {
int root_level;
int shadow_root_level;
union kvm_mmu_page_role base_role;
+ bool direct_map;
u64 *pae_root;
+ u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+
+ bool nx;
+
+ u64 pdptrs[4]; /* pae */
};
struct kvm_vcpu_arch {
- u64 host_tsc;
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -272,7 +281,6 @@ struct kvm_vcpu_arch {
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
- u64 pdptrs[4]; /* pae */
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
@@ -282,7 +290,41 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+ /*
+ * Paging state of the vcpu
+ *
+ * If the vcpu runs in guest mode with two level paging this still saves
+ * the paging mode of the l1 guest. This context is always used to
+ * handle faults.
+ */
struct kvm_mmu mmu;
+
+ /*
+ * Paging state of an L2 guest (used for nested npt)
+ *
+ * This context will save all necessary information to walk page tables
+ * of an L2 guest. This context is only initialized for page table
+ * walking and not for faulting since we never handle l2 page faults on
+ * the host.
+ */
+ struct kvm_mmu nested_mmu;
+
+ /*
+ * Pointer to the mmu context currently used for
+ * gva_to_gpa translations.
+ */
+ struct kvm_mmu *walk_mmu;
+
+ /*
+ * This struct is filled with the necessary information to propagate a
+ * page fault into the guest
+ */
+ struct {
+ u64 address;
+ unsigned error_code;
+ bool nested;
+ } fault;
+
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -336,9 +378,15 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
+ unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+ u64 last_host_tsc;
+ u64 last_guest_tsc;
+ u64 last_kernel_ns;
+ u64 last_tsc_nsec;
+ u64 last_tsc_write;
+ bool tsc_catchup;
bool nmi_pending;
bool nmi_injected;
@@ -367,9 +415,9 @@ struct kvm_vcpu_arch {
};
struct kvm_arch {
- unsigned int n_free_mmu_pages;
+ unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
- unsigned int n_alloc_mmu_pages;
+ unsigned int n_max_mmu_pages;
atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -394,8 +442,14 @@ struct kvm_arch {
gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
- u64 vm_init_tsc;
s64 kvmclock_offset;
+ spinlock_t tsc_write_lock;
+ u64 last_tsc_nsec;
+ u64 last_tsc_offset;
+ u64 last_tsc_write;
+ u32 virtual_tsc_khz;
+ u32 virtual_tsc_mult;
+ s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -505,6 +559,7 @@ struct kvm_x86_ops {
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject);
+ void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -517,11 +572,16 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
+ void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+
+ void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
bool (*has_wbinvd_exit)(void);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
- u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t gfn, void *data, int offset, int len,
+ u32 access);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -652,20 +715,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
return (struct kvm_mmu_page *)page_private(page);
}
-static inline u16 kvm_read_fs(void)
-{
- u16 seg;
- asm("mov %%fs, %0" : "=g"(seg));
- return seg;
-}
-
-static inline u16 kvm_read_gs(void)
-{
- u16 seg;
- asm("mov %%gs, %0" : "=g"(seg));
- return seg;
-}
-
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -673,16 +722,6 @@ static inline u16 kvm_read_ldt(void)
return ldt;
}
-static inline void kvm_load_fs(u16 sel)
-{
- asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
- asm("mov %0, %%gs" : : "rm"(sel));
-}
-
static inline void kvm_load_ldt(u16 sel)
{
asm("lldt %0" : : "rm"(sel));
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e..7b562b6184b 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
return cpuid_eax(KVM_CPUID_FEATURES);
}
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
#endif
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
new file mode 100644
index 00000000000..19ae14ba697
--- /dev/null
+++ b/arch/x86/include/asm/memblock.h
@@ -0,0 +1,23 @@
+#ifndef _X86_MEMBLOCK_H
+#define _X86_MEMBLOCK_H
+
+#define ARCH_DISCARD_MEMBLOCK
+
+u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
+void memblock_x86_to_bootmem(u64 start, u64 end);
+
+void memblock_x86_reserve_range(u64 start, u64 end, char *name);
+void memblock_x86_free_range(u64 start, u64 end);
+struct range;
+int __get_free_all_memory_range(struct range **range, int nodeid,
+ unsigned long start_pfn, unsigned long end_pfn);
+int get_free_all_memory_range(struct range **rangep, int nodeid);
+
+void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn);
+u64 memblock_x86_hole_size(u64 start, u64 end);
+u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
+u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
+u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
+
+#endif
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 3e2ce58a31a..67763c5d8b4 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -60,12 +60,7 @@
#endif
#ifdef CONFIG_X86_32
-# ifdef CONFIG_4KSTACKS
-# define MODULE_STACKSIZE "4KSTACKS "
-# else
-# define MODULE_STACKSIZE ""
-# endif
-# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
#endif
#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf..4a711a684b1 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
*/
#ifndef _ASM_X86_MRST_H
#define _ASM_X86_MRST_H
+
+#include <linux/sfi.h>
+
extern int pci_mrst_init(void);
int __init sfi_parse_mrtc(struct sfi_table_header *table);
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
};
extern enum mrst_cpu_type __mrst_cpu_chip;
-static enum mrst_cpu_type mrst_identify_cpu(void)
+static inline enum mrst_cpu_type mrst_identify_cpu(void)
{
return __mrst_cpu_chip;
}
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(void);
#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f7790fdb..3ea3dc48704 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,6 +121,7 @@
#define MSR_AMD64_IBSDCLINAD 0xc0011038
#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBSBRTARGET 0xc001103b
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
@@ -198,6 +199,7 @@
#define MSR_IA32_TSC 0x00000010
#define MSR_IA32_PLATFORM_ID 0x00000017
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_EBC_FREQUENCY_ID 0x0000002c
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
#define FEATURE_CONTROL_LOCKED (1<<0)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 00000000000..bcdff997668
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_MAX_NUM_CSTATES 8
+
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 101229b0d8e..42a978c0c1b 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits);
/* EC commands */
#define EC_FIRMWARE_REV 0x08
+#define EC_WLAN_ENTER_RESET 0x35
+#define EC_WLAN_LEAVE_RESET 0x25
/* SCI source values */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3..2a8478140bb 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
/* install OFW's pde permanently into the kernel's pgtable */
extern void setup_olpc_ofw_pgd(void);
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
#else /* !CONFIG_OLPC_OPENFIRMWARE */
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
+static inline bool olpc_ofw_present(void) { return false; }
#endif /* !CONFIG_OLPC_OPENFIRMWARE */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 6f1b7331313..ade619ff9e2 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,11 +15,7 @@
*/
#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
-#ifdef CONFIG_4KSTACKS
-#define THREAD_ORDER 0
-#else
#define THREAD_ORDER 1
-#endif
#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
#define STACKFAULT_STACK 0
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c725..1df66211fd1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e..18e3b8a8709 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -105,7 +105,7 @@ static inline void write_cr8(unsigned long x)
}
#endif
-static inline void raw_safe_halt(void)
+static inline void arch_safe_halt(void)
{
PVOP_VCALL0(pv_irq_ops.safe_halt);
}
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
}
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
- unsigned long start, unsigned long count)
-{
- PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
static inline void paravirt_release_pmd(unsigned long pfn)
{
PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
@@ -829,32 +824,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
-static inline unsigned long __raw_local_save_flags(void)
+static inline unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
-static inline void raw_local_irq_restore(unsigned long f)
+static inline void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
-static inline void raw_local_irq_disable(void)
+static inline void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
-static inline void raw_local_irq_enable(void)
+static inline void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
-static inline unsigned long __raw_local_irq_save(void)
+static inline unsigned long arch_local_irq_save(void)
{
unsigned long f;
- f = __raw_local_save_flags();
- raw_local_irq_disable();
+ f = arch_local_save_flags();
+ arch_local_irq_disable();
return f;
}
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef553234..b82bac97525 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
*/
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
- void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d395540ff89..ca0437c714b 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
#include <linux/string.h>
#include <asm/scatterlist.h>
#include <asm/io.h>
+#include <asm/x86_init.h>
#ifdef __KERNEL__
@@ -94,8 +95,36 @@ static inline void early_quirks(void) { }
extern void pci_iommu_alloc(void);
-/* MSI arch hook */
-#define arch_setup_msi_irqs arch_setup_msi_irqs
+#ifdef CONFIG_PCI_MSI
+/* MSI arch specific hooks */
+static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+static inline void x86_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+#define arch_setup_msi_irqs x86_setup_msi_irqs
+#define arch_teardown_msi_irqs x86_teardown_msi_irqs
+#define arch_teardown_msi_irq x86_teardown_msi_irq
+/* implemented in arch/x86/kernel/apic/io_apic.c */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+/* default to the implementation in drivers/pci/msi.c */
+#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+void default_teardown_msi_irqs(struct pci_dev *dev);
+#else
+#define native_setup_msi_irqs NULL
+#define native_teardown_msi_irq NULL
+#define default_teardown_msi_irqs NULL
+#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 49c7219826f..704526734be 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -47,6 +47,7 @@ enum pci_bf_sort_state {
extern unsigned int pcibios_max_latency;
void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
/* pci-pc.c */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910..f899e01a8ac 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -47,6 +47,20 @@
#ifdef CONFIG_SMP
#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
#define __my_cpu_offset percpu_read(this_cpu_off)
+
+/*
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
+ */
+#define __this_cpu_ptr(ptr) \
+({ \
+ unsigned long tcp_ptr__; \
+ __verify_pcpu_ptr(ptr); \
+ asm volatile("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
+ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+})
#else
#define __percpu_arg(x) "%P" #x
#endif
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6e742cc4251..550e26b1dbb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -111,17 +111,18 @@ union cpuid10_edx {
#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
/* IbsFetchCtl bits/masks */
-#define IBS_FETCH_RAND_EN (1ULL<<57)
-#define IBS_FETCH_VAL (1ULL<<49)
-#define IBS_FETCH_ENABLE (1ULL<<48)
-#define IBS_FETCH_CNT 0xFFFF0000ULL
-#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT 0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
/* IbsOpCtl bits */
-#define IBS_OP_CNT_CTL (1ULL<<19)
-#define IBS_OP_VAL (1ULL<<18)
-#define IBS_OP_ENABLE (1ULL<<17)
-#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#ifdef CONFIG_PERF_EVENTS
extern void init_hw_perf_events(void);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index def500776b1..a70cd216be5 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
-/* Non HT mask */
-#define P4_ESCR_MASK \
- (P4_ESCR_EVENT_MASK | \
- P4_ESCR_EVENTMASK_MASK | \
- P4_ESCR_TAG_MASK | \
- P4_ESCR_TAG_ENABLE | \
- P4_ESCR_T0_OS | \
- P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT \
- (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
#define P4_CCCR_OVF 0x80000000U
#define P4_CCCR_CASCADE 0x40000000U
#define P4_CCCR_OVF_PMI_T0 0x04000000U
@@ -70,23 +57,6 @@
#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
-/* Non HT mask */
-#define P4_CCCR_MASK \
- (P4_CCCR_OVF | \
- P4_CCCR_CASCADE | \
- P4_CCCR_OVF_PMI_T0 | \
- P4_CCCR_FORCE_OVF | \
- P4_CCCR_EDGE | \
- P4_CCCR_THRESHOLD_MASK | \
- P4_CCCR_COMPLEMENT | \
- P4_CCCR_COMPARE | \
- P4_CCCR_ESCR_SELECT_MASK | \
- P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT \
- (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
#define P4_GEN_ESCR_EMASK(class, name, bit) \
class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_EMASK_BIT(class, name) class##__##name
@@ -127,6 +97,28 @@
#define P4_CONFIG_HT_SHIFT 63
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR \
+ (P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE)
+
+#define P4_CONFIG_MASK_CCCR \
+ (P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_RESERVED)
+
+/* some dangerous bits are reserved for kernel internals; each mask is fully parenthesized so it can be combined with '&' safely */
+#define P4_CONFIG_MASK \
+ ((p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
+ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a6..ada823a13c7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
pte_update(mm, addr, ptep);
}
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) /* no-op statement; arguments are not evaluated */
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f686f49e8b7..0c92113c4cb 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -26,7 +26,7 @@ struct mm_struct;
struct vm_area_struct;
extern pgd_t swapper_pg_dir[1024];
-extern pgd_t trampoline_pg_dir[1024];
+extern pgd_t initial_page_table[1024];
static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
@@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
#endif
#if defined(CONFIG_HIGHPTE)
-#define __KM_PTE \
- (in_nmi() ? KM_NMI_PTE : \
- in_irq() ? KM_IRQ_PTE : \
- KM_PTE0)
#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
pte_index((address)))
-#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
- pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
-#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#define pte_unmap(pte) kunmap_atomic((pte))
#else
#define pte_offset_map(dir, address) \
((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
-#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
#define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
#endif
/* Clear a kernel PTE and flush it from the TLB */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62b..f86da20347f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
@@ -125,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
-#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))/* NOP */
-#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
#define update_mmu_cache(vma, address, ptep) do { } while (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bdbeba..cae9c3cb95c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
u16 phys_proc_id;
/* Core id: */
u16 cpu_core_id;
+ /* Compute unit id */
+ u8 compute_unit_id;
/* Index into per_cpu list: */
u16 cpu_index;
#endif
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features;
static inline void set_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features |= mask;
cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
static inline void clear_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features &= ~mask;
cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long idle_halt;
extern unsigned long idle_nomwait;
extern bool c1e_detected;
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt. Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions. wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
- mb();
- /* check for clflush to determine if wbinvd is legal */
- if (cpu_has_clflush)
- asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
- else
- while (1)
- halt();
-}
-
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6..7f7e577a0e3 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 14e0ed86a6f..231f1c1d660 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -73,31 +73,31 @@
#define GDT_ENTRY_DEFAULT_USER_DS 15
-#define GDT_ENTRY_KERNEL_BASE 12
+#define GDT_ENTRY_KERNEL_BASE (12)
-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0)
-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1)
-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5)
-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11)
-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
#ifdef CONFIG_SMP
#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
#else
#define __KERNEL_PERCPU 0
#endif
-#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
+#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
#ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
+#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
#else
#define __KERNEL_STACK_CANARY 0
#endif
@@ -182,10 +182,10 @@
#endif
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d7..d6763b139a8 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
: : "i" (sz)); \
}
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries) \
+ type *name; \
+ RESERVE_BRK(name, sizeof(type) * entries)
+
#ifdef __i386__
void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4cfc9082406..4c2f63c7fc1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -50,7 +50,7 @@ struct smp_ops {
void (*smp_prepare_cpus)(unsigned max_cpus);
void (*smp_cpus_done)(unsigned max_cpus);
- void (*smp_send_stop)(void);
+ void (*stop_other_cpus)(int wait);
void (*smp_send_reschedule)(int cpu);
int (*cpu_up)(unsigned cpu);
@@ -73,7 +73,12 @@ extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
{
- smp_ops.smp_send_stop();
+ smp_ops.stop_other_cpus(0);
+}
+
+static inline void stop_other_cpus(void)
+{
+ smp_ops.stop_other_cpus(1);
}
static inline void smp_prepare_boot_cpu(void)
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 8085277e1b8..977f1761a25 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -5,17 +5,26 @@
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
-extern int __init pci_swiotlb_detect(void);
+extern int __init pci_swiotlb_detect_override(void);
+extern int __init pci_swiotlb_detect_4gb(void);
extern void __init pci_swiotlb_init(void);
+extern void __init pci_swiotlb_late_init(void);
#else
#define swiotlb 0
-static inline int pci_swiotlb_detect(void)
+static inline int pci_swiotlb_detect_override(void)
+{
+ return 0;
+}
+static inline int pci_swiotlb_detect_4gb(void)
{
return 0;
}
static inline void pci_swiotlb_init(void)
{
}
+static inline void pci_swiotlb_late_init(void)
+{
+}
#endif
static inline void dma_mark_clean(void *addr, size_t size) {}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7d..169be8938b9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
-extern void zap_low_mappings(bool early);
-
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 4dde797c057..f4500fb3b48 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,16 +13,13 @@ extern unsigned char *trampoline_base;
extern unsigned long init_rsp;
extern unsigned long initial_code;
-extern unsigned long initial_page_table;
extern unsigned long initial_gs;
#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
extern unsigned long setup_trampoline(void);
-extern void __init setup_trampoline_page_table(void);
extern void __init reserve_trampoline_memory(void);
#else
-static inline void setup_trampoline_page_table(void) {}
static inline void reserve_trampoline_memory(void) {}
#endif /* CONFIG_X86_TRAMPOLINE */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index bf6b88ef8ee..e969f691cbf 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
*
* SGI UV architectural definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_HUB_H
@@ -77,7 +77,8 @@
*
* 1111110000000000
* 5432109876543210
- * pppppppppplc0cch
+ * pppppppppplc0cch Nehalem-EX
+ * ppppppppplcc0cch Westmere-EX
* sssssssssss
*
* p = pnode bits
@@ -148,12 +149,25 @@ struct uv_hub_info_s {
unsigned char m_val;
unsigned char n_val;
struct uv_scir_s scir;
+ unsigned char apic_pnode_shift;
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+union uvh_apicid {
+ unsigned long v;
+ struct uvh_apicid_s {
+ unsigned long local_apic_mask : 24;
+ unsigned long local_apic_shift : 5;
+ unsigned long unused1 : 3;
+ unsigned long pnode_mask : 24;
+ unsigned long pnode_shift : 5;
+ unsigned long unused2 : 3;
+ } s;
+};
+
/*
* Local & Global MMR space macros.
* Note: macros are intended to be used ONLY by inline functions
@@ -182,6 +196,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
(((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+#define UVH_APICID 0x002D0E00L
#define UV_APIC_PNODE_SHIFT 6
/* Local Bus from cpu's perspective */
@@ -280,7 +295,7 @@ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
*/
static inline int uv_apicid_to_pnode(int apicid)
{
- return (apicid >> UV_APIC_PNODE_SHIFT);
+ return (apicid >> uv_hub_info->apic_pnode_shift);
}
/*
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index b2f2d2e05ce..6d90adf4428 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -806,6 +806,78 @@ union uvh_node_present_table_u {
};
/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
/* ========================================================================= */
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
@@ -857,6 +929,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
};
/* ========================================================================= */
+/* UVH_RH_GAM_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+union uvh_rh_gam_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_config_mmr_s {
+ unsigned long m_skt : 6; /* RW */
+ unsigned long n_skt : 4; /* RW */
+ unsigned long rsvd_10_11: 2; /* */
+ unsigned long mmiol_cfg : 1; /* RW */
+ unsigned long rsvd_13_63: 51; /* */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
@@ -987,97 +1082,5 @@ union uvh_rtc1_int_config_u {
} s;
};
-/* ========================================================================= */
-/* UVH_SI_ADDR_MAP_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
-
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
-
-union uvh_si_addr_map_config_u {
- unsigned long v;
- struct uvh_si_addr_map_config_s {
- unsigned long m_skt : 6; /* RW */
- unsigned long rsvd_6_7: 2; /* */
- unsigned long n_skt : 4; /* RW */
- unsigned long rsvd_12_63: 52; /* */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
-
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias0_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias0_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
-
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias1_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias1_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
-
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias2_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias2_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-#endif /* _ASM_X86_UV_UV_MMRS_H */
+#endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a290..00000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include <linux/types.h>
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE 0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR 3
-#define VMI_API_REV_MINOR 0
-
-#define VMI_CALL_CPUID 0
-#define VMI_CALL_WRMSR 1
-#define VMI_CALL_RDMSR 2
-#define VMI_CALL_SetGDT 3
-#define VMI_CALL_SetLDT 4
-#define VMI_CALL_SetIDT 5
-#define VMI_CALL_SetTR 6
-#define VMI_CALL_GetGDT 7
-#define VMI_CALL_GetLDT 8
-#define VMI_CALL_GetIDT 9
-#define VMI_CALL_GetTR 10
-#define VMI_CALL_WriteGDTEntry 11
-#define VMI_CALL_WriteLDTEntry 12
-#define VMI_CALL_WriteIDTEntry 13
-#define VMI_CALL_UpdateKernelStack 14
-#define VMI_CALL_SetCR0 15
-#define VMI_CALL_SetCR2 16
-#define VMI_CALL_SetCR3 17
-#define VMI_CALL_SetCR4 18
-#define VMI_CALL_GetCR0 19
-#define VMI_CALL_GetCR2 20
-#define VMI_CALL_GetCR3 21
-#define VMI_CALL_GetCR4 22
-#define VMI_CALL_WBINVD 23
-#define VMI_CALL_SetDR 24
-#define VMI_CALL_GetDR 25
-#define VMI_CALL_RDPMC 26
-#define VMI_CALL_RDTSC 27
-#define VMI_CALL_CLTS 28
-#define VMI_CALL_EnableInterrupts 29
-#define VMI_CALL_DisableInterrupts 30
-#define VMI_CALL_GetInterruptMask 31
-#define VMI_CALL_SetInterruptMask 32
-#define VMI_CALL_IRET 33
-#define VMI_CALL_SYSEXIT 34
-#define VMI_CALL_Halt 35
-#define VMI_CALL_Reboot 36
-#define VMI_CALL_Shutdown 37
-#define VMI_CALL_SetPxE 38
-#define VMI_CALL_SetPxELong 39
-#define VMI_CALL_UpdatePxE 40
-#define VMI_CALL_UpdatePxELong 41
-#define VMI_CALL_MachineToPhysical 42
-#define VMI_CALL_PhysicalToMachine 43
-#define VMI_CALL_AllocatePage 44
-#define VMI_CALL_ReleasePage 45
-#define VMI_CALL_InvalPage 46
-#define VMI_CALL_FlushTLB 47
-#define VMI_CALL_SetLinearMapping 48
-
-#define VMI_CALL_SetIOPLMask 61
-#define VMI_CALL_SetInitialAPState 62
-#define VMI_CALL_APICWrite 63
-#define VMI_CALL_APICRead 64
-#define VMI_CALL_IODelay 65
-#define VMI_CALL_SetLazyMode 73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
-#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT 0x01
-#define VMI_PAGE_PD 0x02
-#define VMI_PAGE_PDP 0x04
-#define VMI_PAGE_PML4 0x08
-
-#define VMI_PAGE_NORMAL 0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK 0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB 0x01
-#define VMI_FLUSH_GLOBAL 0x02
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE 0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP 3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
- unsigned char *eip;
- unsigned char type;
- unsigned char reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- * Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
- u16 rom_signature; /* option ROM signature */
- u8 rom_length; /* ROM length in 512 byte chunks */
- u8 rom_entry[4]; /* 16-bit code entry point */
- u8 rom_pad0; /* 4-byte align pad */
- u32 vrom_signature; /* VROM identification signature */
- u8 api_version_min;/* Minor version of API */
- u8 api_version_maj;/* Major version of API */
- u8 jump_slots; /* Number of jump slots */
- u8 reserved1; /* Reserved for expansion */
- u32 virtual_top; /* Hypervisor virtual address start */
- u16 reserved2; /* Reserved for expansion */
- u16 license_offs; /* Offset to License string */
- u16 pci_header_offs;/* Offset to PCI OPROM header */
- u16 pnp_header_offs;/* Offset to PnP OPROM header */
- u32 rom_pad3; /* PnP reserverd / VMI reserved */
- u8 reserved[96]; /* Reserved for headers */
- char vmi_init[8]; /* VMI_Init jump point */
- char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
- char sig[4];
- char rev;
- char size;
- short next;
- short res;
- long devID;
- unsigned short manufacturer_offset;
- unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
- char sig[4];
- short vendorID;
- short deviceID;
- short vpdData;
- short size;
- char rev;
- char class;
- char subclass;
- char interface;
- short chunks;
- char rom_version_min;
- char rom_version_maj;
- char codetype;
- char lastRom;
- short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
- u32 cr0;
- u32 cr2;
- u32 cr3;
- u32 cr4;
-
- u64 efer;
-
- u32 eip;
- u32 eflags;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esp;
- u32 ebp;
- u32 esi;
- u32 edi;
- u16 cs;
- u16 ss;
- u16 ds;
- u16 es;
- u16 fs;
- u16 gs;
- u16 ldtr;
-
- u16 gdtr_limit;
- u32 gdtr_base;
- u32 idtr_base;
- u16 idtr_limit;
-};
-
-#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3..00000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency 66
-#define VMI_CALL_GetCycleCounter 67
-#define VMI_CALL_SetAlarm 68
-#define VMI_CALL_CancelAlarm 69
-#define VMI_CALL_GetWallclockTime 70
-#define VMI_CALL_WallclockUpdated 71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
- u64 (*get_cycle_frequency)(void);
- u64 (*get_cycle_counter)(int);
- u64 (*get_wallclock)(void);
- int (*wallclock_updated)(void);
- void (*set_alarm)(u32 flags, u64 expiry, u64 period);
- void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready. The vcpu is in the 'running' state if it
- * is executing. When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it). Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'. Stolen time is the time in which the vcpu is in the
- * 'ready' state. Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL 0
-#define VMI_CYCLES_AVAILABLE 1
-#define VMI_CYCLES_STOLEN 2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS 2
-
-#define VMI_ALARM_COUNTER_MASK 0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-
-#define VMI_ALARM_IS_ONESHOT 0x00000000
-#define VMI_ALARM_IS_PERIODIC 0x00000100
-
-#define CONFIG_VMI_ALARM_HZ 100
-
-#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index baa579c8e03..64642ad019f 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -154,9 +154,18 @@ struct x86_platform_ops {
int (*i8042_detect)(void);
};
+struct pci_dev;
+
+struct x86_msi_ops {
+ int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
+ void (*teardown_msi_irq)(unsigned int irq);
+ void (*teardown_msi_irqs)(struct pci_dev *dev);
+};
+
extern struct x86_init_ops x86_init;
extern struct x86_cpuinit_ops x86_cpuinit;
extern struct x86_platform_ops x86_platform;
+extern struct x86_msi_ops x86_msi;
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 5e0eb875891..dd8c1414b3d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -37,14 +37,21 @@ typedef struct xpaddr {
extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
+ unsigned long mfn;
+
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+ mfn = get_phys_to_machine(pfn);
+
+ if (mfn != INVALID_P2M_ENTRY)
+ mfn &= ~FOREIGN_FRAME_BIT;
+
+ return mfn;
}
static inline int phys_to_machine_mapping_valid(unsigned long pfn)
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 00000000000..2329b3eaf8d
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_XEN)
+extern int __init pci_xen_init(void);
+extern int __init pci_xen_hvm_init(void);
+#define pci_xen 1
+#else
+#define pci_xen 0
+#define pci_xen_init (0)
+static inline int pci_xen_hvm_init(void)
+{
+ return -1;
+}
+#endif
+#if defined(CONFIG_XEN_DOM0)
+void __init xen_setup_pirqs(void);
+#else
+static inline void __init xen_setup_pirqs(void)
+{
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+/* The drivers/pci/xen-pcifront.c sets this structure to
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+ int (*enable_msi)(struct pci_dev *dev, int **vectors);
+ void (*disable_msi)(struct pci_dev *dev);
+ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+ void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+ int **vectors)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+ return xen_pci_frontend->enable_msi(dev, vectors);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+ xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+ int **vectors, int nvec)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+ xen_pci_frontend->disable_msix(dev);
+}
+#endif /* CONFIG_PCI_XEN */
+#endif /* CONFIG_PCI_MSI */
+
+#endif /* _ASM_X86_XEN_PCI_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fedf32a8c3e..9e13763b609 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -34,8 +34,8 @@ GCOV_PROFILE_paravirt.o := n
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
-obj-y += setup.o x86_init.o i8259.o irqinit.o
-obj-$(CONFIG_X86_VISWS) += visws_quirks.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
@@ -44,6 +44,7 @@ obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
@@ -56,7 +57,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
-obj-$(CONFIG_SFI) += sfi.o
obj-y += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
@@ -80,20 +80,19 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
-obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -102,13 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-obj-$(CONFIG_SCx200) += scx200.o
-scx200-y += scx200_32.o
-
-obj-$(CONFIG_OLPC) += olpc.o
-obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
-obj-$(CONFIG_X86_MRST) += mrst.o
-
microcode-y := microcode_core.o
microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -121,8 +113,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce..71232b941b6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -513,35 +513,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
return 0;
}
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
-
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (trigger == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ eisa_set_level_irq(gsi);
#endif
+ return gsi;
+}
+
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
- }
+ gsi = mp_register_gsi(dev, gsi, trigger, polarity);
#endif
+
+ return gsi;
+}
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ unsigned int irq;
+ unsigned int plat_gsi = gsi;
+
+ plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
irq = gsi_to_irq(plat_gsi);
return irq;
}
+void __init acpi_set_irq_model_pic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+ __acpi_register_gsi = acpi_register_gsi_pic;
+ acpi_ioapic = 0;
+}
+
+void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+
/*
* ACPI based hotplug support for CPU
*/
@@ -1259,8 +1286,7 @@ static void __init acpi_process_madt(void)
*/
error = acpi_parse_madt_ioapic_entries();
if (!error) {
- acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_ioapic = 1;
+ acpi_set_irq_model_ioapic();
smp_found_config = 1;
}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59b..5812404a0d4 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
+#include <asm/mwait.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec152070..69fd72aa559 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -7,11 +7,16 @@
#include <linux/acpi.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
#include <asm/desc.h>
+#ifdef CONFIG_X86_32
+#include <asm/pgtable.h>
+#endif
+
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -90,7 +95,7 @@ int acpi_save_state_mem(void)
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ header->pmode_cr3 = (u32)__pa(&initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
@@ -125,7 +130,7 @@ void acpi_restore_state_mem(void)
*/
void __init acpi_reserve_wakeup_memory(void)
{
- unsigned long mem;
+ phys_addr_t mem;
if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
@@ -133,15 +138,15 @@ void __init acpi_reserve_wakeup_memory(void)
return;
}
- mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
+ mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
- if (mem == -1L) {
+ if (mem == MEMBLOCK_ERROR) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
return;
}
acpi_realmode = (unsigned long) phys_to_virt(mem);
acpi_wakeup_address = mem;
- reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
+ memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c..5079f24c955 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -522,7 +522,7 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
@@ -637,7 +637,33 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
tpp.len = len;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
- stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ /* Use __stop_machine() because the caller already got online_cpus. */
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
return addr;
}
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+
+#ifdef CONFIG_X86_64
+unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
+#else
+unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
+#endif
+
+void __init arch_init_ideal_nop5(void)
+{
+ /*
+ * There is no good nop for all x86 archs. This selection
+ * algorithm should be unified with the one in find_nop_table(),
+ * but this should be good enough for now.
+ *
+ * For cases other than the ones below, use the safe (as in
+ * always functional) defaults above.
+ */
+#ifdef CONFIG_X86_64
+ /* Don't use these on 32 bits due to broken virtualizers */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ memcpy(ideal_nop5, p6_nops[5], 5);
+#endif
+}
+#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382..d2fdb0826df 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed..6e11c813415 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -31,7 +31,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/x86_init.h>
-
+#include <asm/iommu_table.h>
/*
* definitions for the ACPI scanning code
*/
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
return 1UL << shift;
}
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+ pci_read_config_dword(iommu->dev, 0xfc, &val);
+ return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+ pci_write_config_dword(iommu->dev, 0xfc, val);
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf0, address);
+ pci_read_config_dword(iommu->dev, 0xf4, &val);
+ return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+ pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
/****************************************************************************
*
* AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
u32 range, misc;
+ int i, j;
pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
&iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
- if (is_rd890_iommu(iommu->dev)) {
- pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
- pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
- pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
- pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
- }
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * Some rd890 systems may not be fully reconfigured by the BIOS, so
+ * it's necessary for us to store this information so it can be
+ * reprogrammed on resume
+ */
+
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ &iommu->stored_addr_lo);
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ &iommu->stored_addr_hi);
+
+ /* Low bit locks writes to configuration space */
+ iommu->stored_addr_lo &= ~1;
+
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+ for (i = 0; i < 0x83; i++)
+ iommu->stored_l2[i] = iommu_read_l2(iommu, i);
}
/*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
}
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
{
- if (is_rd890_iommu(iommu->dev)) {
- pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
- pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
- pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
- pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
- }
+ int i, j;
+ u32 ioc_feature_control;
+ struct pci_dev *pdev = NULL;
+
+ /* RD890 BIOSes may not have completely reconfigured the iommu */
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * First, we need to ensure that the iommu is enabled. This is
+ * controlled by a register in the northbridge
+ */
+ pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+ if (!pdev)
+ return;
+
+ /* Select Northbridge indirect register 0x75 and enable writing */
+ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+ pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+ /* Enable the iommu */
+ if (!(ioc_feature_control & 0x1))
+ pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+ pci_dev_put(pdev);
+
+ /* Restore the iommu BAR */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo);
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ iommu->stored_addr_hi);
+
+ /* Restore the l1 indirect regs for each of the 6 l1s */
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+ /* Restore the l2 indirect regs */
+ for (i = 0; i < 0x83; i++)
+ iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+ /* Lock PCI setup registers */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo | 1);
}
/*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable(iommu);
- iommu_apply_quirks(iommu);
iommu_init_flags(iommu);
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_apply_resume_quirks(iommu);
+
/* re-load the hardware */
enable_iommus();
@@ -1405,13 +1499,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
return 0;
}
-void __init amd_iommu_detect(void)
+int __init amd_iommu_detect(void)
{
if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return;
+ return -ENODEV;
if (amd_iommu_disabled)
- return;
+ return -ENODEV;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
@@ -1420,7 +1514,9 @@ void __init amd_iommu_detect(void)
/* Make sure ACS will be enabled */
pci_request_acs();
+ return 1;
}
+ return -ENODEV;
}
/****************************************************************************
@@ -1451,3 +1547,8 @@ static int __init parse_amd_iommu_options(char *str)
__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
+
+IOMMU_INIT_FINISH(amd_iommu_detect,
+ gart_iommu_hole_init,
+ 0,
+ 0);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
index 0f7bc20cfcd..8f6463d8ed0 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
+#include <asm/amd_nb.h>
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
-struct pci_dev **k8_northbridges;
+struct k8_northbridge_info k8_northbridges;
EXPORT_SYMBOL(k8_northbridges);
static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
int i;
struct pci_dev *dev;
- if (num_k8_northbridges)
+ if (k8_northbridges.num)
return 0;
dev = NULL;
while ((dev = next_k8_northbridge(dev)) != NULL)
- num_k8_northbridges++;
+ k8_northbridges.num++;
+
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ k8_northbridges.gart_supported = 1;
- k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
- GFP_KERNEL);
- if (!k8_northbridges)
+ k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
+ sizeof(void *), GFP_KERNEL);
+ if (!k8_northbridges.nb_misc)
return -ENOMEM;
- if (!num_k8_northbridges) {
- k8_northbridges[0] = NULL;
+ if (!k8_northbridges.num) {
+ k8_northbridges.nb_misc[0] = NULL;
return 0;
}
- flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges);
- return -ENOMEM;
+ if (k8_northbridges.gart_supported) {
+ flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
+ GFP_KERNEL);
+ if (!flush_words) {
+ kfree(k8_northbridges.nb_misc);
+ return -ENOMEM;
+ }
}
dev = NULL;
i = 0;
while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges[i] = dev;
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
+ k8_northbridges.nb_misc[i] = dev;
+ if (k8_northbridges.gart_supported)
+ pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
}
- k8_northbridges[i] = NULL;
+ k8_northbridges.nb_misc[i] = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
+ if (!k8_northbridges.gart_supported)
+ return;
+
/* Avoid races between AGP and IOMMU. In theory it's not needed
but I'm not sure if the hardware won't lose flush requests
when another is pending. This whole thing is so expensive anyways
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- pci_write_config_dword(k8_northbridges[i], 0x9c,
+ for (i = 0; i < k8_northbridges.num; i++) {
+ pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
flush_words[i]|1);
flushed++;
}
- for (i = 0; i < num_k8_northbridges; i++) {
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
- pci_read_config_dword(k8_northbridges[i],
+ pci_read_config_dword(k8_northbridges.nb_misc[i],
0x9c, &w);
if (!(w & 1))
break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5..92543c73cf8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
apbt_start_counter(phy_cs_timer_id);
}
-/* Setup IRQ routing via IOAPIC */
-#ifdef CONFIG_SMP
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
- struct irq_chip *chip;
- struct irq_desc *desc;
-
- /* timer0 irq has been setup early */
- if (adev->irq == 0)
- return;
- desc = irq_to_desc(adev->irq);
- chip = get_irq_chip(adev->irq);
- disable_irq(adev->irq);
- desc->status |= IRQ_MOVE_PCNTXT;
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
- set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
- enable_irq(adev->irq);
- if (system_state == SYSTEM_BOOTING)
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
-}
-#endif
-
static void apbt_enable_int(int n)
{
unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void)
}
#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+
+ if (system_state == SYSTEM_BOOTING) {
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+ if (request_irq(adev->irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ adev->name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ adev->num);
+ }
+ } else
+ enable_irq(adev->irq);
+}
+
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
@@ -343,7 +336,7 @@ void apbt_setup_secondary_clock(void)
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
- if (cpu == boot_cpu_id)
+ if (!cpu)
return;
/*
* We need to calculate the scaled math multiplication factor for
@@ -389,16 +382,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_DEAD:
+ disable_irq(adev->irq);
apbt_disable_int(cpu);
- if (system_state == SYSTEM_RUNNING)
+ if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
- else if (adev) {
+ } else if (adev) {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
free_irq(adev->irq, adev);
}
break;
default:
- pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+ pr_debug("APBT notified %lu, no action\n", action);
}
return NOTIFY_OK;
}
@@ -552,7 +546,7 @@ bad_count:
pr_debug("APB CS going back %lx:%lx:%lx ",
t2, last_read, t2 - last_read);
bad_count_x3:
- pr_debug(KERN_INFO "tripple check enforced\n");
+ pr_debug("triple check enforced\n");
t0 = apbt_readl(phy_cs_timer_id,
APBTMR_N_CURRENT_VALUE);
udelay(1);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e1..b3a16e8f070 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/x86_init.h>
int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void)
static int __initdata printed_gart_size_msg;
-void __init gart_iommu_hole_init(void)
+int __init gart_iommu_hole_init(void)
{
u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void)
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
- return;
+ return -ENODEV;
printk(KERN_INFO "Checking aperture...\n");
@@ -463,8 +463,9 @@ out:
unsigned long n = (32 * 1024 * 1024) << last_aper_order;
insert_aperture_resource((u32)last_aper_base, n);
+ return 1;
}
- return;
+ return 0;
}
if (!fallback_aper_force) {
@@ -500,13 +501,18 @@ out:
panic("Not enough memory for aperture");
}
} else {
- return;
+ return 0;
}
/* Fix up the north bridges */
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = DISTLBWALKPRB | aper_order << 1;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -515,13 +521,12 @@ out:
if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
set_up_gart_resume(aper_order, aper_alloc);
+
+ return 1;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49..3f838d53739 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -370,38 +370,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
*
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
*
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determines the offsets, it is not
+ * necessarily a BIOS bug.
*/
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
}
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
+ unsigned int rsvd; /* 0: uninitialized */
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ do {
+ if (rsvd &&
+ !eilvt_entry_is_changeable(rsvd, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+ } while (rsvd != new);
+
+ return new;
}
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
+ "vector 0x%x was already reserved by another core, "
+ "APIC%lX=0x%x\n",
+ smp_processor_id(), new, reserved, reg, old);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
+ "register already in use, APIC%lX=0x%x\n",
+ smp_processor_id(), new, reg, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
/*
* Program the next event, relative to now
@@ -1665,10 +1714,7 @@ int __init APIC_init_uniprocessor(void)
}
#endif
-#ifndef CONFIG_SMP
- enable_IR_x2apic();
default_setup_apic_routing();
-#endif
verify_local_APIC();
connect_bsp_APIC();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb..7cc0a721f62 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -131,13 +131,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static struct irq_pin_list *alloc_irq_pin_list(int node)
{
- struct irq_pin_list *pin;
-
- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-
- return pin;
+ return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -150,10 +146,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- int count;
- int node;
- int i;
+ int count, node, i;
if (!legacy_pic->nr_legacy_irqs) {
nr_irqs_gsi = 0;
@@ -162,13 +155,15 @@ int __init arch_early_irq_init(void)
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
- node= cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
+
+ /* Make sure the legacy interrupts are marked in the bitmap */
+ irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
for (i = 0; i < count; i++) {
- desc = irq_to_desc(i);
- desc->chip_data = &cfg[i];
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+ set_irq_chip_data(i, &cfg[i]);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
/*
* For legacy IRQ's, start with assigning irq0 to irq15 to
* IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +178,88 @@ int __init arch_early_irq_init(void)
}
#ifdef CONFIG_SPARSE_IRQ
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct irq_cfg *irq_cfg(unsigned int irq)
{
- struct irq_cfg *cfg = NULL;
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
- if (desc)
- cfg = desc->chip_data;
-
- return cfg;
+ return get_irq_chip_data(irq);
}
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
{
struct irq_cfg *cfg;
- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
- if (cfg) {
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
- kfree(cfg);
- cfg = NULL;
- } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
- GFP_ATOMIC, node)) {
- free_cpumask_var(cfg->domain);
- kfree(cfg);
- cfg = NULL;
- }
- }
-
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
}
-int arch_init_chip_data(struct irq_desc *desc, int node)
-{
- struct irq_cfg *cfg;
-
- cfg = desc->chip_data;
- if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(node);
- if (!desc->chip_data) {
- printk(KERN_ERR "can not alloc irq_cfg\n");
- BUG_ON(1);
- }
- }
-
- return 0;
-}
-
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
{
- struct irq_pin_list *old_entry, *head, *tail, *entry;
-
- cfg->irq_2_pin = NULL;
- old_entry = old_cfg->irq_2_pin;
- if (!old_entry)
- return;
-
- entry = get_one_free_irq_2_pin(node);
- if (!entry)
+ if (!cfg)
return;
+ set_irq_chip_data(at, NULL);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
+}
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- head = entry;
- tail = entry;
- old_entry = old_entry->next;
- while (old_entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- entry = head;
- while (entry) {
- head = entry->next;
- kfree(entry);
- entry = head;
- }
- /* still use the old one */
- return;
- }
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- tail->next = entry;
- tail = entry;
- old_entry = old_entry->next;
- }
+#else
- tail->next = NULL;
- cfg->irq_2_pin = head;
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
{
- struct irq_pin_list *entry, *next;
-
- if (old_cfg->irq_2_pin == cfg->irq_2_pin)
- return;
+ return irq_cfgx + irq;
+}
- entry = old_cfg->irq_2_pin;
+static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
- while (entry) {
- next = entry->next;
- kfree(entry);
- entry = next;
- }
- old_cfg->irq_2_pin = NULL;
-}
+#endif
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
{
+ int res = irq_alloc_desc_at(at, node);
struct irq_cfg *cfg;
- struct irq_cfg *old_cfg;
-
- cfg = get_one_free_irq_cfg(node);
- if (!cfg)
- return;
-
- desc->chip_data = cfg;
-
- old_cfg = old_desc->chip_data;
-
- cfg->vector = old_cfg->vector;
- cfg->move_in_progress = old_cfg->move_in_progress;
- cpumask_copy(cfg->domain, old_cfg->domain);
- cpumask_copy(cfg->old_domain, old_cfg->old_domain);
-
- init_copy_irq_2_pin(old_cfg, cfg, node);
-}
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = get_irq_chip_data(at);
+ if (cfg)
+ return cfg;
+ }
-static void free_irq_cfg(struct irq_cfg *cfg)
-{
- free_cpumask_var(cfg->domain);
- free_cpumask_var(cfg->old_domain);
- kfree(cfg);
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ set_irq_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
}
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+static int alloc_irq_from(unsigned int from, int node)
{
- struct irq_cfg *old_cfg, *cfg;
-
- old_cfg = old_desc->chip_data;
- cfg = desc->chip_data;
-
- if (old_cfg == cfg)
- return;
-
- if (old_cfg) {
- free_irq_2_pin(old_cfg, cfg);
- free_irq_cfg(old_cfg);
- old_desc->chip_data = NULL;
- }
+ return irq_alloc_desc_from(from, node);
}
-/* end for move_irq_desc */
-#else
-struct irq_cfg *irq_cfg(unsigned int irq)
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ free_irq_cfg(at, cfg);
+ irq_free_desc(at);
}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -451,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +394,7 @@ static void ioapic_mask_entry(int apic, int pin)
* fast in the common case, and fast for shared ISA-space IRQs.
*/
static int
-add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
+__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list **last, *entry;
@@ -493,7 +406,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
last = &entry->next;
}
- entry = get_one_free_irq_2_pin(node);
+ entry = alloc_irq_pin_list(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
node, apic, pin);
@@ -508,7 +421,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
- if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+ if (__add_pin_to_irq_node(cfg, node, apic, pin))
panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
@@ -571,11 +484,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
static void io_apic_sync(struct irq_pin_list *entry)
{
/*
@@ -587,44 +495,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic(struct irq_cfg *cfg)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void mask_ioapic_irq(struct irq_data *data)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- BUG_ON(!cfg);
+ mask_ioapic(data->chip_data);
+}
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void unmask_ioapic(struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_IO_APIC_irq(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- mask_IO_APIC_irq_desc(desc);
-}
-static void unmask_IO_APIC_irq(unsigned int irq)
+static void unmask_ioapic_irq(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(data->chip_data);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -694,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void)
struct IO_APIC_route_entry **ioapic_entries;
ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_ATOMIC);
+ GFP_KERNEL);
if (!ioapic_entries)
return 0;
for (apic = 0; apic < nr_ioapics; apic++) {
ioapic_entries[apic] =
kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_ATOMIC);
+ nr_ioapic_registers[apic], GFP_KERNEL);
if (!ioapic_entries[apic])
goto nomem;
}
@@ -1259,7 +1160,6 @@ void __setup_vector_irq(int cpu)
/* Initialize vector_irq on a new cpu */
int irq, vector;
struct irq_cfg *cfg;
- struct irq_desc *desc;
/*
* vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1168,10 @@ void __setup_vector_irq(int cpu)
*/
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
-
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
+ if (!cfg)
+ continue;
/*
* If it is a legacy IRQ handled by the legacy PIC, this cpu
* will be part of the irq_cfg's domain.
@@ -1327,17 +1228,17 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
{
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
- desc->status |= IRQ_LEVEL;
+ irq_set_status_flags(irq, IRQ_LEVEL);
else
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
- if (irq_remapped(irq)) {
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
if (trigger)
set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
handle_fasteoi_irq,
@@ -1358,10 +1259,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
}
-int setup_ioapic_entry(int apic_id, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin)
+static int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
@@ -1382,21 +1283,7 @@ int setup_ioapic_entry(int apic_id, int irq,
if (index < 0)
panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
- memset(&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- /*
- * Trigger mode in the IRTE will always be edge, and the
- * actual level or edge trigger will be setup in the IO-APIC
- * RTE. This will help simplify level triggered irq migration.
- * For more details, see the comments above explainig IO-APIC
- * irq migration in the presence of interrupt-remapping.
- */
- irte.trigger_mode = 0;
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = vector;
- irte.dest_id = IRTE_DEST(destination);
+ prepare_irte(&irte, vector, destination);
/* Set source-id of interrupt request */
set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1318,14 @@ int setup_ioapic_entry(int apic_id, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
+static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
+ struct irq_cfg *cfg, int trigger, int polarity)
{
- struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
-
- cfg = desc->chip_data;
-
/*
* For legacy irqs, cfg->domain starts with cpu 0 for legacy
* controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1471,9 +1354,9 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
return;
}
- ioapic_register_intr(irq, desc, trigger);
+ ioapic_register_intr(irq, trigger);
if (irq < legacy_pic->nr_legacy_irqs)
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
ioapic_write_entry(apic_id, pin, entry);
}
@@ -1484,11 +1367,9 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
- int notcon = 0;
- struct irq_desc *desc;
+ int apic_id, pin, idx, irq, notcon = 0;
+ int node = cpu_to_node(0);
struct irq_cfg *cfg;
- int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1525,19 +1406,17 @@ static void __init setup_IO_APIC_irqs(void)
apic->multi_timer_check(apic_id, irq))
continue;
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
continue;
- }
- cfg = desc->chip_data;
+
add_pin_to_irq_node(cfg, node, apic_id, pin);
/*
* don't mark it in pin_programmed, so later acpi could
* set it correctly when irq < 16
*/
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
+ irq_polarity(idx));
}
if (notcon)
@@ -1552,9 +1431,7 @@ static void __init setup_IO_APIC_irqs(void)
*/
void setup_IO_APIC_irq_extra(u32 gsi)
{
- int apic_id = 0, pin, idx, irq;
- int node = cpu_to_node(boot_cpu_id);
- struct irq_desc *desc;
+ int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
struct irq_cfg *cfg;
/*
@@ -1570,18 +1447,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
return;
irq = pin_2_irq(idx, apic_id, pin);
-#ifdef CONFIG_SPARSE_IRQ
- desc = irq_to_desc(irq);
- if (desc)
+
+ /* Only handle the non legacy irqs on secondary ioapics */
+ if (apic_id == 0 || irq < NR_IRQS_LEGACY)
return;
-#endif
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
return;
- }
- cfg = desc->chip_data;
add_pin_to_irq_node(cfg, node, apic_id, pin);
if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
@@ -1591,7 +1465,7 @@ void setup_IO_APIC_irq_extra(u32 gsi)
}
set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ setup_ioapic_irq(apic_id, pin, irq, cfg,
irq_trigger(idx), irq_polarity(idx));
}
@@ -1642,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
- struct irq_desc *desc;
unsigned int irq;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
@@ -1729,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_desc(irq, desc) {
+ for_each_active_irq(irq) {
struct irq_pin_list *entry;
- cfg = desc->chip_data;
+ cfg = get_irq_chip_data(irq);
if (!cfg)
continue;
entry = cfg->irq_2_pin;
@@ -2239,29 +2112,26 @@ static int __init timer_irq_works(void)
* an edge even if it isn't on the 8259A...
*/
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
{
- int was_pending = 0;
+ int was_pending = 0, irq = data->irq;
unsigned long flags;
- struct irq_cfg *cfg;
raw_spin_lock_irqsave(&ioapic_lock, flags);
if (irq < legacy_pic->nr_legacy_irqs) {
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- cfg = irq_cfg(irq);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(data->chip_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-static int ioapic_retrigger_irq(unsigned int irq)
+static int ioapic_retrigger_irq(struct irq_data *data)
{
-
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_cfg *cfg = data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2182,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
- if (!irq_remapped(irq))
+ if (!irq_remapped(cfg))
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2192,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
/*
- * Either sets desc->affinity to a valid value, and returns
+ * Either sets data->affinity to a valid value, and returns
* ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves desc->affinity untouched.
+ * leaves data->affinity untouched.
*/
-unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
- unsigned int *dest_id)
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
{
- struct irq_cfg *cfg;
- unsigned int irq;
+ struct irq_cfg *cfg = data->chip_data;
if (!cpumask_intersects(mask, cpu_online_mask))
return -1;
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
+ if (assign_irq_vector(data->irq, data->chip_data, mask))
return -1;
- cpumask_copy(desc->affinity, mask);
+ cpumask_copy(data->affinity, mask);
- *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+ *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
return 0;
}
static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ unsigned int dest, irq = data->irq;
unsigned long flags;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
+ int ret;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = set_desc_affinity(desc, mask, &dest);
+ ret = __ioapic_set_affinity(data, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return ret;
}
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
#ifdef CONFIG_INTR_REMAP
/*
@@ -2395,24 +2246,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
* the interrupt-remapping table entry.
*/
static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return ret;
+ return -EINVAL;
- irq = desc->irq;
if (get_irte(irq, &irte))
- return ret;
+ return -EBUSY;
- cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return ret;
+ return -EBUSY;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2427,29 +2275,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(desc->affinity, mask);
-
+ cpumask_copy(data->affinity, mask);
return 0;
}
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
- const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
#else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
return 0;
}
@@ -2511,10 +2344,8 @@ unlock:
irq_exit();
}
-static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
- struct irq_desc *desc = *descp;
- struct irq_cfg *cfg = desc->chip_data;
unsigned me;
if (likely(!cfg->move_in_progress))
@@ -2526,31 +2357,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
send_cleanup_vector(cfg);
}
-static void irq_complete_move(struct irq_desc **descp)
+static void irq_complete_move(struct irq_cfg *cfg)
{
- __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}
void irq_force_complete_move(int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
if (!cfg)
return;
- __irq_complete_move(&desc, cfg->vector);
+ __irq_complete_move(cfg, cfg->vector);
}
#else
-static inline void irq_complete_move(struct irq_desc **descp) {}
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
#endif
-static void ack_apic_edge(unsigned int irq)
+static void ack_apic_edge(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- irq_complete_move(&desc);
- move_native_irq(irq);
+ irq_complete_move(data->chip_data);
+ move_native_irq(data->irq);
ack_APIC_irq();
}
@@ -2572,10 +2400,12 @@ atomic_t irq_mis_count;
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
if (mp_ioapics[entry->apic].apicver >= 0x20) {
/*
@@ -2584,7 +2414,7 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
* intr-remapping table entry. Hence for the io-apic
* EOI we use the pin number.
*/
- if (irq_remapped(irq))
+ if (irq_remapped(cfg))
io_apic_eoi(entry->apic, entry->pin);
else
io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2423,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
__unmask_and_level_IO_APIC_irq(entry);
}
}
-}
-
-static void eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level(struct irq_data *data)
{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, do_unmask_irq = 0, irq = data->irq;
struct irq_desc *desc = irq_to_desc(irq);
unsigned long v;
- int i;
- struct irq_cfg *cfg;
- int do_unmask_irq = 0;
- irq_complete_move(&desc);
+ irq_complete_move(cfg);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq_desc(desc);
+ mask_ioapic(cfg);
}
#endif
@@ -2658,7 +2474,6 @@ static void ack_apic_level(unsigned int irq)
* we use the above logic (mask+edge followed by unmask+level) from
* Manfred Spraul to clear the remote IRR.
*/
- cfg = desc->chip_data;
i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
@@ -2678,7 +2493,7 @@ static void ack_apic_level(unsigned int irq)
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(irq, cfg);
}
/* Now we can move and renable the irq */
@@ -2709,61 +2524,57 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- cfg = desc->chip_data;
if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
}
#ifdef CONFIG_INTR_REMAP
-static void ir_ack_apic_edge(unsigned int irq)
+static void ir_ack_apic_edge(struct irq_data *data)
{
ack_APIC_irq();
}
-static void ir_ack_apic_level(unsigned int irq)
+static void ir_ack_apic_level(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
ack_APIC_irq();
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(data->irq, data->chip_data);
}
#endif /* CONFIG_INTR_REMAP */
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = ack_apic_edge,
+ .irq_eoi = ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .irq_set_affinity = ioapic_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
- .eoi = ir_ack_apic_level,
+ .irq_ack = ir_ack_apic_edge,
+ .irq_eoi = ir_ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
+ .irq_set_affinity = ir_ioapic_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static inline void init_IO_APIC_traps(void)
{
- int irq;
- struct irq_desc *desc;
struct irq_cfg *cfg;
+ unsigned int irq;
/*
* NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2587,8 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
@@ -2788,7 +2599,7 @@ static inline void init_IO_APIC_traps(void)
legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
+ set_irq_chip(irq, &no_irq_chip);
}
}
}
@@ -2797,7 +2608,7 @@ static inline void init_IO_APIC_traps(void)
* The local APIC irq-chip implementation:
*/
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2805,7 +2616,7 @@ static void mask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2813,21 +2624,21 @@ static void unmask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
{
ack_APIC_irq();
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
{
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -2930,9 +2741,8 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_desc *desc = irq_to_desc(0);
- struct irq_cfg *cfg = desc->chip_data;
- int node = cpu_to_node(boot_cpu_id);
+ struct irq_cfg *cfg = get_irq_chip_data(0);
+ int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2942,7 +2752,7 @@ static inline void __init check_timer(void)
/*
* get/set the timer IRQ vector:
*/
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
assign_irq_vector(0, cfg, apic->target_cpus());
/*
@@ -3001,7 +2811,7 @@ static inline void __init check_timer(void)
add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
- /* for edge trigger, setup_IO_APIC_irq already
+ /* for edge trigger, setup_ioapic_irq already
* leave it unmasked.
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
@@ -3009,12 +2819,12 @@ static inline void __init check_timer(void)
int idx;
idx = find_irq_entry(apic1, pin1, mp_INT);
if (idx != -1 && irq_trigger(idx))
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -3037,14 +2847,14 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
goto out;
}
@@ -3052,7 +2862,7 @@ static inline void __init check_timer(void)
* Cleanup, just in case ...
*/
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
@@ -3069,16 +2879,16 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0, desc);
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3244,49 +3054,42 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want, int node)
+unsigned int create_irq_nr(unsigned int from, int node)
{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
+ struct irq_cfg *cfg;
unsigned long flags;
- struct irq_cfg *cfg_new = NULL;
- struct irq_desc *desc_new = NULL;
-
- irq = 0;
- if (irq_want < nr_irqs_gsi)
- irq_want = nr_irqs_gsi;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_node(new, node);
- if (!desc_new) {
- printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
- }
- cfg_new = desc_new->chip_data;
-
- if (cfg_new->vector != 0)
- continue;
+ unsigned int ret = 0;
+ int irq;
- desc_new = move_irq_desc(desc_new, node);
- cfg_new = desc_new->chip_data;
+ if (from < nr_irqs_gsi)
+ from = nr_irqs_gsi;
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
+ irq = alloc_irq_from(from, node);
+ if (irq < 0)
+ return 0;
+ cfg = alloc_irq_cfg(irq, node);
+ if (!cfg) {
+ free_irq_at(irq, NULL);
+ return 0;
}
- raw_spin_unlock_irqrestore(&vector_lock, flags);
- if (irq > 0)
- dynamic_irq_init_keep_chip_data(irq);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ ret = irq;
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
- return irq;
+ if (ret) {
+ set_irq_chip_data(irq, cfg);
+ irq_clear_status_flags(irq, IRQ_NOREQUEST);
+ } else {
+ free_irq_at(irq, cfg);
+ }
+ return ret;
}
int create_irq(void)
{
- int node = cpu_to_node(boot_cpu_id);
+ int node = cpu_to_node(0);
unsigned int irq_want;
int irq;
@@ -3301,14 +3104,17 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
unsigned long flags;
- dynamic_irq_cleanup_keep_chip_data(irq);
+ irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
- free_irte(irq);
+ if (irq_remapped(cfg))
+ free_irte(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, get_irq_chip_data(irq));
+ __clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_irq_at(irq, cfg);
}
/*
@@ -3332,7 +3138,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
- if (irq_remapped(irq)) {
+ if (irq_remapped(get_irq_chip_data(irq))) {
struct irte irte;
int ir_index;
u16 sub_handle;
@@ -3340,14 +3146,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
ir_index = map_irq_to_irte_handle(irq, &sub_handle);
BUG_ON(ir_index == -1);
- memset (&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
+ prepare_irte(&irte, cfg->vector, dest);
/* Set source-id of interrupt request */
if (pdev)
@@ -3392,26 +3191,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
}
#ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- get_cached_msi_msg_desc(desc, &msg);
+ __get_cached_msi_msg(data->msi_desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg_desc(desc, &msg);
+ __write_msi_msg(data->msi_desc, &msg);
return 0;
}
@@ -3421,17 +3218,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* done in the process context using interrupt-remapping hardware.
*/
static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
if (get_irte(irq, &irte))
return -1;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
irte.vector = cfg->vector;
@@ -3461,27 +3258,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* which implement the MSI or MSI-X Capability Structure.
*/
static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
+ .irq_set_affinity = msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip msi_ir_chip = {
- .name = "IR-PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
+ .name = "IR-PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
/*
@@ -3513,8 +3310,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
- int ret;
struct msi_msg msg;
+ int ret;
ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
@@ -3523,12 +3320,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
@@ -3538,15 +3331,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
return 0;
}
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- unsigned int irq;
- int ret, sub_handle;
+ int node, ret, sub_handle, index = 0;
+ unsigned int irq, irq_want;
struct msi_desc *msidesc;
- unsigned int irq_want;
struct intel_iommu *iommu = NULL;
- int index = 0;
- int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3599,25 +3389,24 @@ error:
return ret;
}
-void arch_teardown_msi_irq(unsigned int irq)
+void native_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
}
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct msi_msg msg;
- unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
dmar_msi_read(irq, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
@@ -3633,14 +3422,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
- .ack = ack_apic_edge,
+ .name = "DMAR_MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = dmar_msi_set_affinity,
+ .irq_set_affinity = dmar_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_dmar_msi(unsigned int irq)
@@ -3661,26 +3450,24 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- hpet_msi_read(irq, &msg);
+ hpet_msi_read(data->handler_data, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- hpet_msi_write(irq, &msg);
+ hpet_msi_write(data->handler_data, &msg);
return 0;
}
@@ -3688,34 +3475,33 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip ir_hpet_msi_type = {
- .name = "IR-HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
+ .name = "IR-HPET_MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
- .ack = ack_apic_edge,
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = hpet_msi_set_affinity,
+ .irq_set_affinity = hpet_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
{
- int ret;
struct msi_msg msg;
- struct irq_desc *desc = irq_to_desc(irq);
+ int ret;
if (intr_remapping_enabled) {
struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,9 +3519,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
if (ret < 0)
return ret;
- hpet_msi_write(irq, &msg);
- desc->status |= IRQ_MOVE_PCNTXT;
- if (irq_remapped(irq))
+ hpet_msi_write(get_irq_data(irq), &msg);
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ if (irq_remapped(get_irq_chip_data(irq)))
set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
handle_edge_irq, "edge");
else
@@ -3768,33 +3554,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- target_ht_irq(irq, dest, cfg->vector);
-
+ target_ht_irq(data->irq, dest, cfg->vector);
return 0;
}
#endif
static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-HT",
+ .irq_mask = mask_ht_irq,
+ .irq_unmask = unmask_ht_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
+ .irq_set_affinity = ht_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3867,6 +3650,11 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+int get_nr_irqs_gsi(void)
+{
+ return nr_irqs_gsi;
+}
+
#ifdef CONFIG_SPARSE_IRQ
int __init arch_probe_nr_irqs(void)
{
@@ -3885,14 +3673,13 @@ int __init arch_probe_nr_irqs(void)
if (nr < nr_irqs)
nr_irqs = nr;
- return 0;
+ return NR_IRQS_LEGACY;
}
#endif
static int __io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr)
{
- struct irq_desc *desc;
struct irq_cfg *cfg;
int node;
int ioapic, pin;
@@ -3908,13 +3695,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
if (dev)
node = dev_to_node(dev);
else
- node = cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
return 0;
- }
pin = irq_attr->ioapic_pin;
trigger = irq_attr->trigger;
@@ -3924,15 +3709,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
* IRQs < 16 are already in the irq_2_pin[] map
*/
if (irq >= legacy_pic->nr_legacy_irqs) {
- cfg = desc->chip_data;
- if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+ if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
pin, irq);
return 0;
}
}
- setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+ setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
return 0;
}
@@ -4125,14 +3909,14 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
+ mask = desc->irq_data.affinity;
else
mask = apic->target_cpus();
if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ ir_ioapic_set_affinity(&desc->irq_data, mask, false);
else
- set_ioapic_affinity_irq_desc(desc, mask);
+ ioapic_set_affinity(&desc->irq_data, mask, false);
}
}
@@ -4316,19 +4100,18 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
void __init pre_init_apic_IRQ0(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
printk(KERN_INFO "Early APIC setup for system timer0\n");
#ifndef CONFIG_SMP
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
#endif
- desc = irq_to_desc_alloc_node(0, 0);
+ /* Make sure the irq descriptor is set up */
+ cfg = alloc_irq_and_cfg_at(0, 0);
setup_local_APIC();
- cfg = irq_cfg(0);
add_pin_to_irq_node(cfg, 0, 0, 0);
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
- setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+ setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a43f71cb30f..c90041ccb74 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void)
error:
if (nmi_watchdog == NMI_IO_APIC) {
if (!timer_through_8259)
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161..960f26ab5c9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
#include <linux/nodemask.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
@@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
node_end_pfn[node] =
MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
- e820_register_active_regions(node, node_start_pfn[node],
+ memblock_x86_register_active_regions(node, node_start_pfn[node],
node_end_pfn[node]);
memory_present(node, node_start_pfn[node], node_end_pfn[node]);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e..f9e4e6a5407 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
*/
void __init default_setup_apic_routing(void)
{
+
+ enable_IR_x2apic();
+
#ifdef CONFIG_X86_X2APIC
if (x2apic_mode
#ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f744f54cb24..194539aea17 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -41,6 +41,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
static u64 gru_start_paddr, gru_end_paddr;
+static union uvh_apicid uvh_apicid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
static DEFINE_SPINLOCK(uv_nmi_lock);
@@ -70,12 +71,27 @@ static int early_get_nodeid(void)
return node_id.s.node_id;
}
+static void __init early_get_apic_pnode_shift(void)
+{
+ unsigned long *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
+ uvh_apicid.v = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ if (!uvh_apicid.v)
+ /*
+ * Old bios, use default value
+ */
+ uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
+}
+
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
int nodeid;
if (!strcmp(oem_id, "SGI")) {
nodeid = early_get_nodeid();
+ early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
if (!strcmp(oem_table_id, "UVL"))
@@ -84,7 +100,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
__get_cpu_var(x2apic_extra_bits) =
- nodeid << (UV_APIC_PNODE_SHIFT - 1);
+ nodeid << (uvh_apicid.s.pnode_shift - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
return 1;
}
@@ -363,14 +379,14 @@ struct redir_addr {
#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
static __initdata struct redir_addr redir_addrs[] = {
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
};
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
- union uvh_si_alias0_overlay_config_u alias;
+ union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
int i;
@@ -644,7 +660,7 @@ void uv_nmi_init(void)
void __init uv_system_init(void)
{
- union uvh_si_addr_map_config_u m_n_config;
+ union uvh_rh_gam_config_mmr_u m_n_config;
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
@@ -654,7 +670,7 @@ void __init uv_system_init(void)
map_low_mmrs();
- m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+ m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
mmr_base =
@@ -716,6 +732,10 @@ void __init uv_system_init(void)
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
nid = cpu_to_node(cpu);
+ /*
+ * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
+ */
+ uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b..0e4f24c2a74 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -189,8 +189,8 @@
* Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
*
* [This document is available free from Intel by calling 800.628.8686 (fax
- * 916.356.6100) or 800.548.4725; or via anonymous ftp from
- * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
* available from Microsoft by calling 206.882.8080.]
*
* APM 1.2 Reference:
@@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = {
.unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
+ .llseek = noop_llseek,
};
static struct miscdevice apm_device = {
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf640389..1a4088dda37 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -99,9 +99,7 @@ void foo(void)
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
- DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
- DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
- DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
+ DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46..13a38917951 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
-#include <asm/e820.h>
+#include <linux/memblock.h>
+
#include <asm/proto.h>
/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
static unsigned __read_mostly corruption_check_size = 64*1024;
static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static struct scan_area {
+ u64 addr;
+ u64 size;
+} scan_areas[MAX_SCAN_AREAS];
static int num_scan_areas;
-
static __init int set_corruption_check(char *arg)
{
char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+ addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
- if (!(addr + 1))
+ if (addr == MEMBLOCK_ERROR)
break;
if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
if ((addr + size) > corruption_check_size)
size = corruption_check_size - addr;
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+ memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
scan_areas[num_scan_areas].addr = addr;
scan_areas[num_scan_areas].size = size;
num_scan_areas++;
@@ -105,7 +108,6 @@ void __init setup_bios_corruption_check(void)
printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
num_scan_areas);
- update_e820();
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f0..9e093f8fe78 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
#endif
/*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
{
- unsigned long long value;
- u32 nodes, cores_per_node;
+ u32 nodes;
+ u8 node_id;
int cpu = smp_processor_id();
- if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
- return;
+ /* get information required for multi-node processors */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ u32 eax, ebx, ecx, edx;
- /* fixup topology information only once for a core */
- if (cpu_has(c, X86_FEATURE_AMD_DCM))
- return;
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+ nodes = ((ecx >> 8) & 7) + 1;
+ node_id = ecx & 7;
- rdmsrl(MSR_FAM10H_NODE_ID, value);
+ /* get compute unit information */
+ smp_num_siblings = ((ebx >> 8) & 3) + 1;
+ c->compute_unit_id = ebx & 0xff;
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
- nodes = ((value >> 3) & 7) + 1;
- if (nodes == 1)
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes = ((value >> 3) & 7) + 1;
+ node_id = value & 7;
+ } else
return;
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
+ /* fixup multi-node processor information */
+ if (nodes > 1) {
+ u32 cores_per_node;
+
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ cores_per_node = c->x86_max_cores / nodes;
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = value & 7;
+ /* store NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = node_id;
- /* fixup core id to be in range from 0 to (cores_per_node - 1) */
- c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ /* core id to be in range from 0 to (cores_per_node - 1) */
+ c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ }
}
#endif
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- /* fixup topology information on multi-node processors */
- if ((c->x86 == 0x10) && (c->x86_model == 9))
- amd_fixup_dcm(c);
+ amd_get_topology(c);
#endif
}
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* We need to do the following only once */
+ if (c != &boot_cpu_data)
+ return;
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ printk(KERN_WARNING FW_BUG "TSC doesn't count "
+ "with P0 frequency!\n");
+ }
+ }
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
#endif
if (c->extended_cpuid_level >= 0x80000006) {
- if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+ if (cpuid_edx(0x80000006) & 0xf000)
num_cache_leaves = 4;
else
num_cache_leaves = 3;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25..4b68bda3093 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_early_init(c);
#ifdef CONFIG_SMP
- c->cpu_index = boot_cpu_id;
+ c->cpu_index = 0;
#endif
filter_cpuid_features(c, false);
}
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
}
/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work. Disable it completely
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
* unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
*/
static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- /*
- * Force FPU initialization:
- */
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
-
fpu_init();
xsave_init();
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d4..e765633f210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
extern void get_cpu_cap(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index cd8da247dda..a2baafb2fe6 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
per_cpu(acfreq_data, policy->cpu) = NULL;
acpi_processor_unregister_performance(data->acpi_data,
policy->cpu);
+ kfree(data->freq_table);
kfree(data);
}
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 733093d6043..141abebc451 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = {
* Detects nForce2 A2 and C1 stepping
*
*/
-static unsigned int nforce2_detect_chipset(void)
+static int nforce2_detect_chipset(void)
{
nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
PCI_DEVICE_ID_NVIDIA_NFORCE2,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index fc09f142d94..d9f51367666 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq;
* Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
* and MSR_TMTA_LONGRUN_CTRL
*/
-static void __init longrun_get_policy(struct cpufreq_policy *policy)
+static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
{
u32 msr_lo, msr_hi;
@@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu)
* TMTA rules:
* performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
*/
-static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
+static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
unsigned int *high_freq)
{
u32 msr_lo, msr_hi;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efb..d16c2c53d6b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -170,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE)
- node = first_node(node_online_map);
- else if (!node_online(node)) {
+ if (node == NUMA_NO_NODE || !node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 898c2f4eab8..17ad0336621 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
#include <asm/processor.h>
#include <linux/smp.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/smp.h>
#define LVL_1_INST 1
@@ -306,7 +306,7 @@ struct _cache_attr {
ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
};
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
/*
* L3 cache descriptors
@@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
@@ -369,7 +370,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
return;
/* not in virtualized environments */
- if (num_k8_northbridges == 0)
+ if (k8_northbridges.num == 0)
return;
/*
@@ -377,7 +378,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
* never freed but this is done only on shutdown so it doesn't matter.
*/
if (!l3_caches) {
- int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+ int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
l3_caches = kzalloc(size, GFP_ATOMIC);
if (!l3_caches)
@@ -556,12 +557,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
show_cache_disable_1, store_cache_disable_1);
-#else /* CONFIG_CPU_SUP_AMD */
+#else /* CONFIG_AMD_NB */
static void __cpuinit
amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
{
};
-#endif /* CONFIG_CPU_SUP_AMD */
+#endif /* CONFIG_AMD_NB */
static int
__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1001,7 @@ static struct attribute *default_attrs[] = {
static struct attribute *default_l3_attrs[] = {
DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
&cache_disable_0.attr,
&cache_disable_1.attr,
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa..1e8d66c1336 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
.release = seq_release,
.read = seq_read,
.write = severities_coverage_write,
+ .llseek = seq_lseek,
};
static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909f..7a35b72d7c0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = {
.read = mce_read,
.poll = mce_poll,
.unlocked_ioctl = mce_ioctl,
+ .llseek = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab..80c482382d5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct thresh_restart tr;
- u8 lvt_off;
+ int lvt_off = -1;
+ u8 offset;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
- lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0);
+ offset = (high & MASK_LVTOFF_HI) >> 20;
+ if (lvt_off < 0) {
+ if (setup_APIC_eilvt(offset,
+ THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0)) {
+ pr_err(FW_BUG "cpu %d, failed to "
+ "setup threshold interrupt "
+ "for bank %d, block %d "
+ "(MSR%08X=0x%x%08x)",
+ smp_processor_id(), bank, block,
+ address, high, low);
+ continue;
+ }
+ lvt_off = offset;
+ } else if (lvt_off != offset) {
+ pr_err(FW_BUG "cpu %d, invalid threshold "
+ "interrupt offset %d for bank %d,"
+ "block %d (MSR%08X=0x%x%08x)",
+ smp_processor_id(), lvt_off, bank,
+ block, address, high, low);
+ continue;
+ }
high &= ~MASK_LVTOFF_HI;
high |= lvt_off << 20;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f..4b683267eca 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
static void unexpected_thermal_interrupt(void)
{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+ printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d07142..ac140c7be39 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d0388..9f27228ceff 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
}
}
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask >>= PAGE_SHIFT;
+ mask |= size_or_mask;
+ size = -mask;
+ size <<= PAGE_SHIFT;
+ return size;
+}
+
/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Check and return the effective type for MTRR-MTRR type overlap.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+ if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+ (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+ *prev = MTRR_TYPE_WRTHROUGH;
+ *curr = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (*prev != *curr) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Error/Semi-error returns:
+ * 0xFF - when MTRR is not enabled
+ * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
+ * corresponds only to [start:*partial_end].
+ * Caller has to lookup again for [*partial_end:end].
+ */
+static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
+ *repeat = 0;
if (!mtrr_state_set)
return 0xFF;
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
+
+ if (start_state != end_state) {
+ /*
+ * We have start:end spanning across an MTRR.
+ * We split the region into
+ * either
+ * (start:mtrr_end) (mtrr_end:end)
+ * or
+ * (start:mtrr_start) (mtrr_start:end)
+ * depending on kind of overlap.
+ * Return the type for first region and a pointer to
+ * the start of second region so that caller will
+ * lookup again on the second region.
+ * Note: This way we handle multiple overlaps as well.
+ */
+ if (start_state)
+ *partial_end = base + get_mtrr_size(mask);
+ else
+ *partial_end = base;
+
+ if (unlikely(*partial_end <= start)) {
+ WARN_ON(1);
+ *partial_end = start + PAGE_SIZE;
+ }
+
+ end = *partial_end - 1; /* end is inclusive */
+ *repeat = 1;
+ }
if ((start & mask) != (base & mask))
continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
continue;
}
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE) {
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
+ if (check_type_overlap(&prev_match, &curr_match))
+ return curr_match;
}
if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
return mtrr_state.def_type;
}
+/*
+ * Returns the effective MTRR type for the region
+ * Error return:
+ * 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ u8 type, prev_type;
+ int repeat;
+ u64 partial_end;
+
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ /*
+ * Common path is with repeat = 0.
+ * However, we can have cases where [start:end] spans across some
+ * MTRR range. Do repeated lookups for that case here.
+ */
+ while (repeat) {
+ prev_type = type;
+ start = partial_end;
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ if (check_type_overlap(&prev_type, &type))
+ return type;
+ }
+
+ return type;
+}
+
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad..ed6310183ef 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -49,7 +49,6 @@ static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
unsigned long size, len = 0;
struct page *page;
void *map;
@@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
offset = addr & (PAGE_SIZE - 1);
size = min(PAGE_SIZE - offset, n - len);
- map = kmap_atomic(page, type);
+ map = kmap_atomic(page);
memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
+ kunmap_atomic(map);
put_page(page);
len += size;
@@ -238,6 +237,7 @@ struct x86_pmu {
* Intel DebugStore bits
*/
int bts, pebs;
+ int bts_active, pebs_active;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
@@ -381,7 +381,7 @@ static void release_pmc_hardware(void) {}
#endif
-static int reserve_ds_buffers(void);
+static void reserve_ds_buffers(void);
static void release_ds_buffers(void);
static void hw_perf_event_destroy(struct perf_event *event)
@@ -478,7 +478,7 @@ static int x86_setup_perfctr(struct perf_event *event)
if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
(hwc->sample_period == 1)) {
/* BTS is not supported by this architecture. */
- if (!x86_pmu.bts)
+ if (!x86_pmu.bts_active)
return -EOPNOTSUPP;
/* BTS is currently only allowed for user-mode. */
@@ -497,12 +497,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
int precise = 0;
/* Support for constant skid */
- if (x86_pmu.pebs)
+ if (x86_pmu.pebs_active) {
precise++;
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr)
- precise++;
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr)
+ precise++;
+ }
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@@ -531,7 +532,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
/*
* Setup the hardware configuration for a given attr_type
*/
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
{
int err;
@@ -544,11 +545,8 @@ static int __hw_perf_event_init(struct perf_event *event)
if (atomic_read(&active_events) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
- else {
- err = reserve_ds_buffers();
- if (err)
- release_pmc_hardware();
- }
+ else
+ reserve_ds_buffers();
}
if (!err)
atomic_inc(&active_events);
@@ -584,7 +582,7 @@ static void x86_pmu_disable_all(void)
}
}
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -619,7 +617,7 @@ static void x86_pmu_enable_all(int added)
}
}
-static const struct pmu pmu;
+static struct pmu pmu;
static inline int is_x86_event(struct perf_event *event)
{
@@ -801,10 +799,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
hwc->last_tag == cpuc->tags[i];
}
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct perf_event *event;
@@ -840,7 +838,14 @@ void hw_perf_enable(void)
match_prev_assignment(hwc, cpuc, i))
continue;
- x86_pmu_stop(event);
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
}
for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +857,10 @@ void hw_perf_enable(void)
else if (i < n_running)
continue;
- x86_pmu_start(event);
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
perf_events_lapic_init();
@@ -953,15 +961,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
}
/*
- * activate a single event
+ * Add a single event to the PMU.
*
* The event is added to the group of enabled events
* but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
*/
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc;
@@ -970,58 +975,67 @@ static int x86_pmu_enable(struct perf_event *event)
hwc = &event->hw;
+ perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
- n = collect_events(cpuc, event, false);
- if (n < 0)
- return n;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be peformed
- * at commit time(->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
- goto out;
+ goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
- return ret;
+ goto out;
/*
* copy new assignment, now we know it is possible
* will be used by hw_perf_enable()
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
-out:
+done_collect:
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
- return 0;
+ ret = 0;
+out:
+ perf_pmu_enable(event->pmu);
+ return ret;
}
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx = event->hw.idx;
- if (idx == -1)
- return -EAGAIN;
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ x86_perf_event_set_period(event);
+ }
+
+ event->hw.state = 0;
- x86_perf_event_set_period(event);
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
__set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
- int ret = x86_pmu_start(event);
- WARN_ON_ONCE(ret);
}
void perf_event_print_debug(void)
@@ -1078,27 +1092,29 @@ void perf_event_print_debug(void)
local_irq_restore(flags);
}
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
-
- if (!__test_and_clear_bit(idx, cpuc->active_mask))
- return;
-
- x86_pmu.disable(event);
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- x86_perf_event_update(event);
+ if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+ x86_pmu.disable(event);
+ cpuc->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
- cpuc->events[idx] = NULL;
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ x86_perf_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
}
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int i;
@@ -1111,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event)
if (cpuc->group_flag & PERF_EVENT_TXN)
return;
- x86_pmu_stop(event);
+ x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i]) {
@@ -1134,7 +1150,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
struct perf_event *event;
- struct hw_perf_event *hwc;
int idx, handled = 0;
u64 val;
@@ -1155,7 +1170,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
}
event = cpuc->events[idx];
- hwc = &event->hw;
val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1185,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
continue;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
if (handled)
@@ -1180,25 +1194,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
return handled;
}
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1388,7 +1383,6 @@ void __init init_hw_perf_events(void)
x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
}
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
- perf_max_events = x86_pmu.num_counters;
if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,6 +1418,7 @@ void __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+ perf_pmu_register(&pmu);
perf_cpu_notifier(x86_pmu_notifier);
}
@@ -1437,10 +1432,11 @@ static inline void x86_pmu_read(struct perf_event *event)
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*/
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ perf_pmu_disable(pmu);
cpuc->group_flag |= PERF_EVENT_TXN;
cpuc->n_txn = 0;
}
@@ -1450,7 +1446,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -1460,6 +1456,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
*/
cpuc->n_added -= cpuc->n_txn;
cpuc->n_events -= cpuc->n_txn;
+ perf_pmu_enable(pmu);
}
/*
@@ -1467,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
* Perform the group schedulability test as a whole
* Return 0 if success
*/
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1486,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
memcpy(cpuc->assign, assign, n*sizeof(int));
cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+ perf_pmu_enable(pmu);
return 0;
}
-static const struct pmu pmu = {
- .enable = x86_pmu_enable,
- .disable = x86_pmu_disable,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
- .unthrottle = x86_pmu_unthrottle,
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
-};
-
/*
* validate that we can schedule this event
*/
@@ -1579,12 +1564,22 @@ out:
return ret;
}
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
{
- const struct pmu *tmp;
+ struct pmu *tmp;
int err;
- err = __hw_perf_event_init(event);
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ err = __x86_pmu_event_init(event);
if (!err) {
/*
* we temporarily connect event to its pmu
@@ -1604,26 +1599,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (err) {
if (event->destroy)
event->destroy(event);
- return ERR_PTR(err);
}
- return &pmu;
+ return err;
}
-/*
- * callchain support
- */
+static struct pmu pmu = {
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
+ .event_init = x86_pmu_event_init,
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
+};
+
+/*
+ * callchain support
+ */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1645,7 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- callchain_store(entry, addr);
+ perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
@@ -1656,11 +1656,15 @@ static const struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack_bp,
};
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ perf_callchain_store(entry, regs->ip);
dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
@@ -1689,7 +1693,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (fp < compat_ptr(regs->sp))
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = compat_ptr(frame.next_frame);
}
return 1;
@@ -1702,19 +1706,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
}
#endif
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct stack_frame frame;
const void __user *fp;
- if (!user_mode(regs))
- regs = task_pt_regs(current);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
fp = (void __user *)regs->bp;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->ip);
+ perf_callchain_store(entry, regs->ip);
if (perf_callchain_user32(regs, entry))
return;
@@ -1731,52 +1736,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
if ((unsigned long)fp < regs->sp)
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = frame.next_frame;
}
}
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return NULL;
- }
-
- if (in_nmi())
- entry = &__get_cpu_var(pmc_nmi_entry);
- else
- entry = &__get_cpu_var(pmc_irq_entry);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
-
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3..e421b8cd694 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
@@ -280,11 +280,11 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
struct amd_nb *nb;
int i;
- nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
+ nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
if (!nb)
return NULL;
- memset(nb, 0, sizeof(*nb));
nb->nb_id = nb_id;
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d..c8f5c088cad 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
struct cpu_hw_events *cpuc;
int bit, loops;
u64 status;
- int handled = 0;
+ int handled;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
intel_pmu_disable_all();
- intel_pmu_drain_bts_buffer();
+ handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
if (!status) {
intel_pmu_enable_all(0);
- return 0;
+ return handled;
}
loops = 0;
@@ -763,7 +763,7 @@ again:
data.period = event->hw.last_period;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311c..b7dcd9f2b8a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static int alloc_pebs_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ int node = cpu_to_node(cpu);
+ int max, thresh = 1; /* always use a single PEBS record */
+ void *buffer;
+
+ if (!x86_pmu.pebs)
+ return 0;
+
+ buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+ ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ ds->pebs_index = ds->pebs_buffer_base;
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+ max * x86_pmu.pebs_record_size;
+
+ ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+ thresh * x86_pmu.pebs_record_size;
+
+ return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ ds->pebs_buffer_base = 0;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ int node = cpu_to_node(cpu);
+ int max, thresh;
+ void *buffer;
+
+ if (!x86_pmu.bts)
+ return 0;
+
+ buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ thresh = max / 16;
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ thresh * BTS_RECORD_SIZE;
+
+ return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds || !x86_pmu.bts)
+ return;
+
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ ds->bts_buffer_base = 0;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+ int node = cpu_to_node(cpu);
+ struct debug_store *ds;
+
+ ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!ds))
+ return -ENOMEM;
+
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+ kfree(ds);
+}
+
static void release_ds_buffers(void)
{
int cpu;
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
return;
get_online_cpus();
-
for_each_online_cpu(cpu)
fini_debug_store_on_cpu(cpu);
for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- continue;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
-
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- kfree(ds);
+ release_pebs_buffer(cpu);
+ release_bts_buffer(cpu);
+ release_ds_buffer(cpu);
}
-
put_online_cpus();
}
-static int reserve_ds_buffers(void)
+static void reserve_ds_buffers(void)
{
- int cpu, err = 0;
+ int bts_err = 0, pebs_err = 0;
+ int cpu;
+
+ x86_pmu.bts_active = 0;
+ x86_pmu.pebs_active = 0;
if (!x86_pmu.bts && !x86_pmu.pebs)
- return 0;
+ return;
+
+ if (!x86_pmu.bts)
+ bts_err = 1;
+
+ if (!x86_pmu.pebs)
+ pebs_err = 1;
get_online_cpus();
for_each_possible_cpu(cpu) {
- struct debug_store *ds;
- void *buffer;
- int max, thresh;
+ if (alloc_ds_buffer(cpu)) {
+ bts_err = 1;
+ pebs_err = 1;
+ }
+
+ if (!bts_err && alloc_bts_buffer(cpu))
+ bts_err = 1;
+
+ if (!pebs_err && alloc_pebs_buffer(cpu))
+ pebs_err = 1;
- err = -ENOMEM;
- ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds))
+ if (bts_err && pebs_err)
break;
- per_cpu(cpu_hw_events, cpu).ds = ds;
-
- if (x86_pmu.bts) {
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
- }
+ }
- if (x86_pmu.pebs) {
- buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
- ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
- /*
- * Always use single record PEBS
- */
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- x86_pmu.pebs_record_size;
- }
+ if (bts_err) {
+ for_each_possible_cpu(cpu)
+ release_bts_buffer(cpu);
+ }
- err = 0;
+ if (pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_pebs_buffer(cpu);
}
- if (err)
- release_ds_buffers();
- else {
+ if (bts_err && pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+ } else {
+ if (x86_pmu.bts && !bts_err)
+ x86_pmu.bts_active = 1;
+
+ if (x86_pmu.pebs && !pebs_err)
+ x86_pmu.pebs_active = 1;
+
for_each_online_cpu(cpu)
init_debug_store_on_cpu(cpu);
}
put_online_cpus();
-
- return err;
}
/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
update_debugctlmsr(debugctlmsr);
}
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
struct pt_regs regs;
if (!event)
- return;
+ return 0;
- if (!ds)
- return;
+ if (!x86_pmu.bts_active)
+ return 0;
at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
top = (struct bts_record *)(unsigned long)ds->bts_index;
if (top <= at)
- return;
+ return 0;
ds->bts_index = ds->bts_buffer_base;
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
perf_prepare_sample(&header, &data, event, &regs);
if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
- return;
+ return 1;
for (; at < top; at++) {
data.ip = at->from;
@@ -270,6 +355,7 @@ static void intel_pmu_drain_bts_buffer(void)
/* There's new data available. */
event->hw.interrupts++;
event->pending_kill = POLL_IN;
+ return 1;
}
/*
@@ -491,7 +577,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.flags &= ~PERF_EFLAGS_EXACT;
if (perf_event_overflow(event, 1, &data, &regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
struct pebs_record_core *at, *top;
int n;
- if (!ds || !x86_pmu.pebs)
+ if (!x86_pmu.pebs_active)
return;
at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
u64 status = 0;
int bit, n;
- if (!ds || !x86_pmu.pebs)
+ if (!x86_pmu.pebs_active)
return;
at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -629,9 +715,8 @@ static void intel_ds_init(void)
#else /* CONFIG_CPU_SUP_INTEL */
-static int reserve_ds_buffers(void)
+static void reserve_ds_buffers(void)
{
- return 0;
}
static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 24901517399..81400b93e69 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
struct p4_event_bind {
unsigned int opcode; /* Event code and ESCR selector */
unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ unsigned int escr_emask; /* valid ESCR EventMask bits */
+ unsigned int shared; /* event is shared across threads */
char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
};
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
[P4_EVENT_TC_DELIVER_MODE] = {
.opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+ .shared = 1,
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_BPU_FETCH_REQUEST] = {
.opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
.escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_ITLB_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
.escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_MEMORY_CANCEL] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MEMORY_COMPLETE] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_LOAD_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_STORE_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MOB_LOAD_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
.escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_PAGE_WALK_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
.escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_CACHE_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ALLOCATION] = {
.opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_FSB_DATA_ACTIVITY] = {
.opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
.cntr = { {0, -1, -1}, {1, -1, -1} },
},
[P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_SSE_INPUT_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_64BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_128BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_X87_FP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_TC_MISC] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MISC),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_GLOBAL_POWER_EVENTS] = {
.opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_TC_MS_XFER] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_UOP_QUEUE_WRITES] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RESOURCE_STALL] = {
.opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
.escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_WC_BUFFER] = {
.opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_B2B_CYCLES] = {
.opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BNR] = {
.opcode = P4_OPCODE(P4_EVENT_BNR),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_SNOOP] = {
.opcode = P4_OPCODE(P4_EVENT_SNOOP),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_RESPONSE] = {
.opcode = P4_OPCODE(P4_EVENT_RESPONSE),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_FRONT_END_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_EXECUTION_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_REPLAY_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOPS_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOP_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
.escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MACHINE_CLEAR] = {
.opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_COMPLETED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
};
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
return config;
}
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+ /* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */
+ if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+ if (boot_cpu_data.x86_model != 3 &&
+ boot_cpu_data.x86_model != 4 &&
+ boot_cpu_data.x86_model != 6)
+ return false;
+ }
+
+ /*
+ * For info
+ * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+ */
+
+ return true;
+}
+
static int p4_validate_raw_event(struct perf_event *event)
{
- unsigned int v;
+ unsigned int v, emask;
- /* user data may have out-of-bound event index */
+ /* User data may have out-of-bound event index */
v = p4_config_unpack_event(event->attr.config);
- if (v >= ARRAY_SIZE(p4_event_bind_map)) {
- pr_warning("P4 PMU: Unknown event code: %d\n", v);
+ if (v >= ARRAY_SIZE(p4_event_bind_map))
+ return -EINVAL;
+
+ /* It may be unsupported: */
+ if (!p4_event_match_cpu_model(v))
return -EINVAL;
+
+ /*
+ * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
+ * in Architectural Performance Monitoring, it means not
+ * on _which_ logical cpu to count but rather _when_, ie it
+ * depends on logical cpu state -- count event if one cpu active,
+ * none, both or any, so we just allow user to pass any value
+ * desired.
+ *
+ * In turn we always set Tx_OS/Tx_USR bits bound to logical
+ * cpu without their propagation to another cpu
+ */
+
+ /*
+ * if an event is shared across the logical threads
+ * the user needs special permissions to be able to use it
+ */
+ if (p4_event_bind_map[v].shared) {
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
}
+ /* ESCR EventMask bits may be invalid */
+ emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+ if (emask & ~p4_event_bind_map[v].escr_emask)
+ return -EINVAL;
+
/*
- * it may have some screwed PEBS bits
+ * it may have some invalid PEBS bits
*/
- if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
- pr_warning("P4 PMU: PEBS are not supported yet\n");
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
return -EINVAL;
- }
+
v = p4_config_unpack_metric(event->attr.config);
- if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
- pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map))
return -EINVAL;
- }
return 0;
}
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW) {
+ /*
+ * Clear bits we reserve to be managed by kernel itself
+ * and never allowed from a user space
+ */
+ event->attr.config &= P4_CONFIG_MASK;
+
rc = p4_validate_raw_event(event);
if (rc)
goto out;
/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- *
* Note that for RAW events we allow user to use P4_CCCR_RESERVED
* bits since we keep additional info here (for cache events and etc)
- *
- * XXX: HT wide things should check perf_paranoid_cpu() &&
- * CAP_SYS_ADMIN
*/
- event->hw.config |= event->attr.config &
- (p4_config_pack_escr(P4_ESCR_MASK_HT) |
- p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
-
- event->hw.config &= ~P4_CCCR_FORCE_OVF;
+ event->hw.config |= event->attr.config;
}
rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f849..d9f4ff8fcd6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
- return;
- wd_ops = &k7_wd_ops;
- break;
+ if (boot_cpu_data.x86 == 6 ||
+ (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
+ wd_ops = &k7_wd_ops;
+ return;
case X86_VENDOR_INTEL:
/* Work around where perfctr1 doesn't have a working enable
* bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d4907951512..c7f64e6f537 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
+ { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
+ { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
+ { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
+ { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
+ { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3c..d5cd13945d5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
- vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+ vaddr = kmap_atomic_pfn(pfn);
if (!userbuf) {
memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada6..994828899e0 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;
- vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
} else
memcpy(buf, vaddr + offset, csize);
+ set_iounmap_nonlazy();
iounmap(vaddr);
return csize;
}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 0f6376ffa2d..1bc7f75a5bd 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (kstack_end(stack))
break;
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %08lx", *stack++);
+ printk(KERN_CONT "\n");
+ printk(KERN_CONT " %08lx", *stack++);
touch_nmi_watchdog();
}
- printk("\n");
+ printk(KERN_CONT "\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 57a21f11c79..6a340485249 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack >= irq_stack && stack <= irq_stack_end) {
if (stack == irq_stack_end) {
stack = (unsigned long *) (irq_stack_end[-1]);
- printk(" <EOI> ");
+ printk(KERN_CONT " <EOI> ");
}
} else {
if (((long) stack & (THREAD_SIZE-1)) == 0)
break;
}
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %016lx", *stack++);
+ printk(KERN_CONT "\n");
+ printk(KERN_CONT " %016lx", *stack++);
touch_nmi_watchdog();
}
preempt_enable();
- printk("\n");
+ printk(KERN_CONT "\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb..0c2b7ef7a34 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -15,6 +15,7 @@
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
+#include <linux/memblock.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -738,73 +739,7 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area(ei_start, ei_last, start, end,
- size, align);
-
- if (addr != -1ULL)
- return addr;
- }
- return -1ULL;
-}
-
-u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
-{
- return find_e820_area(start, end, size, align);
-}
-
-u64 __init get_max_mapped(void)
-{
- u64 end = max_pfn_mapped;
-
- end <<= PAGE_SHIFT;
-
- return end;
-}
-/*
- * Find next free range after *start
- */
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area_size(ei_start, ei_last, start,
- sizep, align);
-
- if (addr != -1ULL)
- return addr;
- }
-
- return -1ULL;
-}
-
-/*
- * pre allocated 4k and reserved it in e820
+ * pre allocated 4k and reserved it in memblock and e820_saved
*/
u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
{
@@ -813,8 +748,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
u64 start;
for (start = startt; ; start += size) {
- start = find_e820_area_size(start, &size, align);
- if (!(start + 1))
+ start = memblock_x86_find_in_range_size(start, &size, align);
+ if (start == MEMBLOCK_ERROR)
return 0;
if (size >= sizet)
break;
@@ -830,10 +765,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
addr = round_down(start + size - sizet, align);
if (addr < start)
return 0;
- e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ memblock_x86_reserve_range(addr, addr + sizet, "new next");
e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820 for early_reserve_e820\n");
- update_e820();
+ printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
update_e820_saved();
return addr;
@@ -895,74 +829,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
{
return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
}
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init e820_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - ((u64)ram << PAGE_SHIFT);
-}
static void early_panic(char *msg)
{
@@ -1210,3 +1076,48 @@ void __init setup_memory_map(void)
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
+
+void __init memblock_x86_fill(void)
+{
+ int i;
+ u64 end;
+
+ /*
+ * EFI may have more than 128 entries
+ * We are safe to enable resizing, because memblock_x86_fill()
+ * is called rather late on x86
+ */
+ memblock_can_resize = 1;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ end = ei->addr + ei->size;
+ if (end != (resource_size_t)end)
+ continue;
+
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+ continue;
+
+ memblock_add(ei->addr, ei->size);
+ }
+
+ memblock_analyze();
+ memblock_dump_all();
+}
+
+void __init memblock_find_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+ u64 free_size_pfn;
+ u64 mem_size_pfn;
+ /*
+ * need to find out used area below MAX_DMA_PFN
+ * need to use memblock to get free size in [0, MAX_DMA_PFN]
+ * at first, and assume boot_mem will not take below MAX_DMA_PFN
+ */
+ mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+ free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+ set_dma_reserve(mem_size_pfn - free_size_pfn);
+#endif
+}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf268..76b8cd953de 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
-#endif
static void __init ati_bugs(int num, int slot, int func)
{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ac..4572f25f932 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
+#include <asm/mrst.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
+#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+ if (!strncmp(buf, "mrst", 4)) {
+ mrst_early_console_init();
+ early_console_register(&early_mrst_console, keep);
+ }
+
+ if (!strncmp(buf, "hsu", 3)) {
+ hsu_early_console_init();
+ early_console_register(&early_hsu_console, keep);
+ }
+
+#endif
buf++;
}
return 0;
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 00000000000..65df603622b
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT 0x200000
+#define MRST_REGBASE_SPI0 0xff128000
+#define MRST_REGBASE_SPI1 0xff128400
+#define MRST_CLK_SPI0_REG 0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET 0
+
+#define SPI_FRF_OFFSET 4
+#define SPI_FRF_SPI 0x0
+#define SPI_FRF_SSP 0x1
+#define SPI_FRF_MICROWIRE 0x2
+#define SPI_FRF_RESV 0x3
+
+#define SPI_MODE_OFFSET 6
+#define SPI_SCPH_OFFSET 6
+#define SPI_SCOL_OFFSET 7
+#define SPI_TMOD_OFFSET 8
+#define SPI_TMOD_TR 0x0 /* xmit & recv */
+#define SPI_TMOD_TO 0x1 /* xmit only */
+#define SPI_TMOD_RO 0x2 /* recv only */
+#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET 10
+#define SPI_SRL_OFFSET 11
+#define SPI_CFS_OFFSET 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK 0x7f /* cover 7 bits */
+#define SR_BUSY (1 << 0)
+#define SR_TF_NOT_FULL (1 << 1)
+#define SR_TF_EMPT (1 << 2)
+#define SR_RF_NOT_EMPT (1 << 3)
+#define SR_RF_FULL (1 << 4)
+#define SR_TX_ERR (1 << 5)
+#define SR_DCOL (1 << 6)
+
+struct dw_spi_reg {
+ u32 ctrl0;
+ u32 ctrl1;
+ u32 ssienr;
+ u32 mwcr;
+ u32 ser;
+ u32 baudr;
+ u32 txfltr;
+ u32 rxfltr;
+ u32 txflr;
+ u32 rxflr;
+ u32 sr;
+ u32 imr;
+ u32 isr;
+ u32 risr;
+ u32 txoicr;
+ u32 rxoicr;
+ u32 rxuicr;
+ u32 msticr;
+ u32 icr;
+ u32 dmacr;
+ u32 dmatdlr;
+ u32 dmardlr;
+ u32 idr;
+ u32 version;
+
+ /* Currently operates as 32 bits, though only the low 16 bits matter */
+ u32 dr;
+} __packed;
+
+#define dw_readl(dw, name) __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessible address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ int i;
+
+ /* When we get here, we'd better re-init the HW */
+ mrst_early_console_init();
+
+ for (i = 0; i < l1; i++)
+ early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+ for (i = 0; i < l2; i++)
+ early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+ u16 config;
+
+ config = 0xc001;
+ dw_writel(pspi, dr, config);
+}
+
+/* Translate char to an eligible word and send to max3110 */
+static void max3110_write_data(char c)
+{
+ u16 data;
+
+ data = 0x8000 | c;
+ dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+ u32 ctrlr0 = 0;
+ u32 spi0_cdiv;
+ u32 freq; /* Frequency info only needs to be searched once */
+
+ /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+ pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ MRST_CLK_SPI0_REG);
+ spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+ freq = 100000000 / (spi0_cdiv + 1);
+
+ if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+ mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+ pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ mrst_spi_paddr);
+
+ /* Disable SPI controller */
+ dw_writel(pspi, ssienr, 0);
+
+ /* Set control param, 8 bits, transmit only mode */
+ ctrlr0 = dw_readl(pspi, ctrl0);
+
+ ctrlr0 &= 0xfcc0;
+ ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+ | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+ dw_writel(pspi, ctrl0, ctrlr0);
+
+ /*
+ * Change the spi0 clk to comply with 115200 bps, use 100000 to
+ * calculate the clk divider to make the clock a little slower
+ * than real baud rate.
+ */
+ dw_writel(pspi, baudr, freq/100000);
+
+ /* Disable all INT for early phase */
+ dw_writel(pspi, imr, 0x0);
+
+ /* Set the cs to spi-uart */
+ dw_writel(pspi, ser, 0x2);
+
+ /* Enable the HW, the last step for HW init */
+ dw_writel(pspi, ssienr, 0x1);
+
+ /* Set the default configuration */
+ max3110_write_config();
+
+ /* Register the kmsg dumper */
+ if (!dumper_registered) {
+ dw_dumper.dump = dw_kmsg_dump;
+ kmsg_dump_register(&dw_dumper);
+ dumper_registered = 1;
+ }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+ unsigned int timeout;
+ u32 sr;
+
+ timeout = MRST_SPI_TIMEOUT;
+ /* Early putc needs to make sure the TX FIFO is not full */
+ while (--timeout) {
+ sr = dw_readl(pspi, sr);
+ if (!(sr & SR_TF_NOT_FULL))
+ cpu_relax();
+ else
+ break;
+ }
+
+ if (!timeout)
+ pr_warning("MRST earlycon: timed out\n");
+ else
+ max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_mrst_spi_putc('\r');
+ early_mrst_spi_putc(*str);
+ str++;
+ }
+}
+
+struct console early_mrst_console = {
+ .name = "earlymrst",
+ .write = early_mrst_spi_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR 0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+ u8 lcr;
+
+ phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ HSU_PORT2_PADDR);
+
+ /* Disable FIFO */
+ writeb(0x0, phsu + UART_FCR);
+
+ /* Set to default 115200 bps, 8n1 */
+ lcr = readb(phsu + UART_LCR);
+ writeb((0x80 | lcr), phsu + UART_LCR);
+ writeb(0x18, phsu + UART_DLL);
+ writeb(lcr, phsu + UART_LCR);
+ writel(0x3600, phsu + UART_MUL*4);
+
+ writeb(0x8, phsu + UART_MCR);
+ writeb(0x7, phsu + UART_FCR);
+ writeb(0x3, phsu + UART_LCR);
+
+ /* Clear IRQ status */
+ readb(phsu + UART_LSR);
+ readb(phsu + UART_RX);
+ readb(phsu + UART_IIR);
+ readb(phsu + UART_MSR);
+
+ /* Enable FIFO */
+ writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+ unsigned int timeout = 10000; /* 10ms */
+ u8 status;
+
+ while (--timeout) {
+ status = readb(phsu + UART_LSR);
+ if (status & BOTH_EMPTY)
+ break;
+ udelay(1);
+ }
+
+ /* Only write the char when there was no timeout */
+ if (timeout)
+ writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_hsu_putc('\r');
+ early_hsu_putc(*str);
+ str++;
+ }
+}
+
+struct console early_hsu_console = {
+ .name = "earlyhsu",
+ .write = early_hsu_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2..59e175e8959 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
/* unfortunately push/pop can't be no-op */
.macro PUSH_GS
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
.endm
.macro POP_GS pop=0
addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
#else /* CONFIG_X86_32_LAZY_GS */
.macro PUSH_GS
- pushl %gs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %gs
/*CFI_REL_OFFSET gs, 0*/
.endm
.macro POP_GS pop=0
-98: popl %gs
- CFI_ADJUST_CFA_OFFSET -4
+98: popl_cfi %gs
/*CFI_RESTORE gs*/
.if \pop <> 0
add $\pop, %esp
@@ -195,35 +192,25 @@
.macro SAVE_ALL
cld
PUSH_GS
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0;*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0;*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0;*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl $(__USER_DS), %edx
movl %edx, %ds
@@ -234,39 +221,29 @@
.endm
.macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edx
CFI_RESTORE edx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edi
CFI_RESTORE edi
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebp
CFI_RESTORE ebp
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
CFI_RESTORE eax
.endm
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
-1: popl %ds
- CFI_ADJUST_CFA_OFFSET -4
+1: popl_cfi %ds
/*CFI_RESTORE ds;*/
-2: popl %es
- CFI_ADJUST_CFA_OFFSET -4
+2: popl_cfi %es
/*CFI_RESTORE es;*/
-3: popl %fs
- CFI_ADJUST_CFA_OFFSET -4
+3: popl_cfi %fs
/*CFI_RESTORE fs;*/
POP_GS \pop
.pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
ENTRY(ret_from_fork)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- pushl $0x0202 # Reset kernel eflags
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
+ pushl_cfi $0x0202 # Reset kernel eflags
+ popfl_cfi
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
* enough kernel state to call TRACE_IRQS_OFF can be called - but
* we immediately enable interrupts at that point anyway.
*/
- pushl $(__USER_DS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__USER_DS
/*CFI_REL_OFFSET ss, 0*/
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET esp, 0
- pushfl
+ pushfl_cfi
orl $X86_EFLAGS_IF, (%esp)
- CFI_ADJUST_CFA_OFFSET 4
- pushl $(__USER_CS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__USER_CS
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -486,8 +453,7 @@ sysenter_audit:
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
call audit_syscall_entry
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
- CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
shr $16, %edx
mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl $__ESPFIX_SS
- CFI_ADJUST_CFA_OFFSET 4
- push %eax /* new kernel esp */
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__ESPFIX_SS
+ pushl_cfi %eax /* new kernel esp */
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and
ALIGN
work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
movl %eax, %esp
#else
movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
#define PTREGSCALL3(name) \
ALIGN; \
ptregs_##name: \
+ CFI_STARTPROC; \
leal 4(%esp),%eax; \
- pushl %eax; \
+ pushl_cfi %eax; \
movl PT_EDX(%eax),%ecx; \
movl PT_ECX(%eax),%edx; \
movl PT_EBX(%eax),%eax; \
call sys_##name; \
addl $4,%esp; \
- ret
+ CFI_ADJUST_CFA_OFFSET -4; \
+ ret; \
+ CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
ALIGN;
ptregs_clone:
+ CFI_STARTPROC
leal 4(%esp),%eax
- pushl %eax
- pushl PT_EDI(%eax)
+ pushl_cfi %eax
+ pushl_cfi PT_EDI(%eax)
movl PT_EDX(%eax),%ecx
movl PT_ECX(%eax),%edx
movl PT_EBX(%eax),%eax
call sys_clone
addl $8,%esp
+ CFI_ADJUST_CFA_OFFSET -8
ret
+ CFI_ENDPROC
+ENDPROC(ptregs_clone)
.macro FIXUP_ESPFIX_STACK
/*
@@ -795,10 +763,8 @@ ptregs_clone:
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
- pushl $__KERNEL_DS
- CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__KERNEL_DS
+ pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
.endif
-1: pushl $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 4
+1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
- pushl $~(nr); \
- CFI_ADJUST_CFA_OFFSET 4; \
+ pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
@@ -893,21 +857,18 @@ ENDPROC(name)
ENTRY(coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_error
jmp error_code
CFI_ENDPROC
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl $do_general_protection
+661: pushl_cfi $do_general_protection
662:
.section .altinstructions,"a"
.balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
664:
.previous
#else
- pushl $do_simd_coprocessor_error
+ pushl_cfi $do_simd_coprocessor_error
#endif
- CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_device_not_available
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
+ pushl_cfi $do_device_not_available
jmp error_code
CFI_ENDPROC
END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
ENTRY(overflow)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_overflow
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_overflow
jmp error_code
CFI_ENDPROC
END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_bounds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_bounds
jmp error_code
CFI_ENDPROC
END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_invalid_op
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_invalid_op
jmp error_code
CFI_ENDPROC
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_segment_overrun
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
CFI_ENDPROC
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
- pushl $do_invalid_TSS
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
- pushl $do_segment_not_present
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
- pushl $do_stack_segment
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
- pushl $do_alignment_check
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0 # no error code
+ pushl_cfi $do_divide_error
jmp error_code
CFI_ENDPROC
END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl machine_check_vector
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi machine_check_vector
jmp error_code
CFI_ENDPROC
END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_spurious_interrupt_bug
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_spurious_interrupt_bug
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
SAVE_ALL
TRACE_IRQS_OFF
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
+5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
ENTRY(page_fault)
RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_page_fault
ALIGN
error_code:
/* the function address is in %gs's slot on the stack */
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
cld
movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
movl TSS_sysenter_sp0 + \offset(%esp), %esp
CFI_DEF_CFA esp, 0
CFI_UNDEFINED eip
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- pushl $__KERNEL_CS
- CFI_ADJUST_CFA_OFFSET 4
- pushl $sysenter_past_esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushfl_cfi
+ pushl_cfi $__KERNEL_CS
+ pushl_cfi $sysenter_past_esp
CFI_REL_OFFSET eip, 0
.endm
@@ -1377,8 +1299,7 @@ ENTRY(debug)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # error code 0
@@ -1398,32 +1319,27 @@ END(debug)
*/
ENTRY(nmi)
RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
je nmi_espfix_stack
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
*/
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
jae nmi_stack_correct
cmpl $ia32_sysenter_target,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
/* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
*
* create the pointer to lss back
*/
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ss
+ pushl_cfi %esp
addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi 16(%esp)
.endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
ENTRY(int3)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
@@ -1490,8 +1401,7 @@ END(int3)
ENTRY(general_protection)
RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_general_protection
jmp error_code
CFI_ENDPROC
END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 17be5ec7cbb..fe2690d71c0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_DS /* ss */
/*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi \child_rip /* rip */
CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* orig rax */
.endm
.macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
+ pushq_cfi kernel_eflags(%rip)
+ popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -521,11 +513,9 @@ sysret_careful:
jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
jmp sysret_check
/* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
@@ -714,9 +700,8 @@ END(ptregscall_common)
ENTRY(stub_execve)
CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
+ addq $8, %rsp
+ PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
+ PARTIAL_FRAME 0
SAVE_REST
movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -8
.endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
+1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -796,8 +780,8 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
+ subq $ORIG_RAX-ARGOFFSET+8, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
call save_args
PARTIAL_FRAME 0
call \func
@@ -822,6 +806,7 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $~(\num)
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
@@ -981,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
-apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
- invalidate_interrupt0 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
- invalidate_interrupt1 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
- invalidate_interrupt2 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
- invalidate_interrupt3 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
- invalidate_interrupt4 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
- invalidate_interrupt5 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
- invalidate_interrupt6 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
- invalidate_interrupt7 smp_invalidate_interrupt
+.irpc idx, "01234567"
+apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
+ invalidate_interrupt\idx smp_invalidate_interrupt
+.endr
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1023,9 +993,9 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+ irq_work_interrupt smp_irq_work_interrupt
#endif
/*
@@ -1036,8 +1006,8 @@ ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1052,9 +1022,9 @@ END(\sym)
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
@@ -1070,9 +1040,9 @@ END(\sym)
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
@@ -1089,8 +1059,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1107,8 +1077,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
TRACE_IRQS_OFF
@@ -1139,16 +1109,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
/* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
+ pushfq_cfi
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
+ popfq_cfi
ret
CFI_ENDPROC
END(native_load_gs_index)
@@ -1215,8 +1183,7 @@ END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1192,7 @@ ENTRY(call_softirq)
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1336,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
/* ebx: no swapgs flag */
ENTRY(paranoid_exit)
- INTR_FRAME
+ DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
@@ -1445,7 +1413,6 @@ error_swapgs:
error_sti:
TRACE_IRQS_OFF
ret
- CFI_ENDPROC
/*
* There are two places in the kernel that can potentially fault with
@@ -1470,6 +1437,7 @@ bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
jmp error_swapgs
+ CFI_ENDPROC
END(error_entry)
@@ -1498,8 +1466,8 @@ ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54e..3afb33f14d2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
return mod_code_status;
}
-
-
-
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
-
static unsigned char *ftrace_nop_replace(void)
{
- return ftrace_nop;
+ return ideal_nop5;
}
static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
int __init ftrace_dyn_arch_init(void *data)
{
- extern const unsigned char ftrace_test_p6nop[];
- extern const unsigned char ftrace_test_nop5[];
- extern const unsigned char ftrace_test_jmp[];
- int faulted = 0;
-
- /*
- * There is no good nop for all x86 archs.
- * We will default to using the P6_NOP5, but first we
- * will test to make sure that the nop will actually
- * work on this CPU. If it faults, we will then
- * go to a lesser efficient 5 byte nop. If that fails
- * we then just use a jmp as our nop. This isn't the most
- * efficient nop, but we can not use a multi part nop
- * since we would then risk being preempted in the middle
- * of that nop, and if we enabled tracing then, it might
- * cause a system crash.
- *
- * TODO: check the cpuid to determine the best nop.
- */
- asm volatile (
- "ftrace_test_jmp:"
- "jmp ftrace_test_p6nop\n"
- "nop\n"
- "nop\n"
- "nop\n" /* 2 byte jmp + 3 bytes */
- "ftrace_test_p6nop:"
- P6_NOP5
- "jmp 1f\n"
- "ftrace_test_nop5:"
- ".byte 0x66,0x66,0x66,0x66,0x90\n"
- "1:"
- ".section .fixup, \"ax\"\n"
- "2: movl $1, %0\n"
- " jmp ftrace_test_nop5\n"
- "3: movl $2, %0\n"
- " jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(ftrace_test_p6nop, 2b)
- _ASM_EXTABLE(ftrace_test_nop5, 3b)
- : "=r"(faulted) : "0" (faulted));
-
- switch (faulted) {
- case 0:
- pr_info("converting mcount calls to 0f 1f 44 00 00\n");
- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
- break;
- case 1:
- pr_info("converting mcount calls to 66 66 66 66 90\n");
- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
- break;
- case 2:
- pr_info("converting mcount calls to jmp . + 5\n");
- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
- break;
- }
-
/* The return code is retured via data */
*(unsigned long *)data = 0;
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9..af0699ba48c 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/memblock.h>
#include <asm/setup.h>
#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
lowmem = 0x9f000;
/* reserve all memory between lowmem and the 1MB mark */
- reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+ memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c0625..763310165fa 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
#include <linux/mm.h>
+#include <linux/memblock.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -17,6 +18,7 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/tlbflush.h>
static void __init i386_default_early_setup(void)
{
@@ -30,17 +32,18 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
+ memblock_init();
+
#ifdef CONFIG_X86_TRAMPOLINE
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
- reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
- "EX TRAMPOLINE");
+ memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
#endif
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -49,7 +52,7 @@ void __init i386_start_kernel(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd61..2d2673c28af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
+#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -79,6 +80,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
/* Cleanup the over mapped high alias */
cleanup_highmap();
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -98,7 +101,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_init();
+
+ memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -107,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8e09f..bcece91dd31 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -183,13 +183,12 @@ default_entry:
#ifdef CONFIG_X86_PAE
/*
- * In PAE mode swapper_pg_dir is statically defined to contain enough
- * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
- * entries). The identity mapping is handled by pointing two PGD
- * entries to the first kernel PMD.
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD.
*
- * Note the upper half of each PMD or PTE are always zero at
- * this stage.
+ * Note the upper half of each PMD or PTE are always zero at this stage.
*/
#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -197,7 +196,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_pmd), %edx
+ movl $pa(initial_pg_pmd), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +225,14 @@ default_entry:
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_dir), %edx
+ movl $pa(initial_page_table), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_dir+0xffc)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_page_table+0xffc)
#endif
jmp 3f
/*
@@ -334,7 +333,7 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl pa(initial_page_table), %eax
+ movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $X86_CR0_PG,%eax
@@ -614,8 +613,6 @@ ignore_int:
.align 4
ENTRY(initial_code)
.long i386_start_kernel
-ENTRY(initial_page_table)
- .long pa(swapper_pg_dir)
/*
* BSS section
@@ -623,20 +620,18 @@ ENTRY(initial_page_table)
__PAGE_ALIGNED_BSS
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
-swapper_pg_pmd:
+initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(swapper_pg_dir)
+ENTRY(initial_page_table)
.fill 1024,4,0
#endif
-swapper_pg_fixmap:
+initial_pg_fixmap:
.fill 1024,4,0
-#ifdef CONFIG_X86_TRAMPOLINE
-ENTRY(trampoline_pg_dir)
- .fill 1024,4,0
-#endif
ENTRY(empty_zero_page)
.fill 4096,1,0
+ENTRY(swapper_pg_dir)
+ .fill 1024,4,0
/*
* This starts the data section.
@@ -645,20 +640,20 @@ ENTRY(empty_zero_page)
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+ENTRY(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b..ae03cab4352 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -380,44 +380,35 @@ static int hpet_next_event(unsigned long delta,
struct clock_event_device *evt, int timer)
{
u32 cnt;
+ s32 res;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
- * We need to read back the CMP register on certain HPET
- * implementations (ATI chipsets) which seem to delay the
- * transfer of the compare register into the internal compare
- * logic. With small deltas this might actually be too late as
- * the counter could already be higher than the compare value
- * at that point and we would wait for the next hpet interrupt
- * forever. We found out that reading the CMP register back
- * forces the transfer so we can rely on the comparison with
- * the counter register below. If the read back from the
- * compare register does not match the value we programmed
- * then we might have a real hardware problem. We can not do
- * much about it here, but at least alert the user/admin with
- * a prominent warning.
- *
- * An erratum on some chipsets (ICH9,..), results in
- * comparator read immediately following a write returning old
- * value. Workaround for this is to read this value second
- * time, when first read returns old value.
- *
- * In fact the write to the comparator register is delayed up
- * to two HPET cycles so the workaround we tried to restrict
- * the readback to those known to be borked ATI chipsets
- * failed miserably. So we give up on optimizations forever
- * and penalize all HPET incarnations unconditionally.
+ * HPETs are a complete disaster. The compare register is
+ * based on a equal comparison and neither provides a less
+ * than or equal functionality (which would require to take
+ * the wraparound into account) nor a simple count down event
+ * mode. Further the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). We worked around that by reading
+ * back the compare register, but that required another
+ * workaround for ICH9,10 chips where the first readout after
+ * write can return the old stale value. We already have a
+ * minimum delta of 5us enforced, but a NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than 8 HPET cycles
+ * away from the event or if the counter is already ahead of
+ * the event.
*/
- if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
- if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
- printk_once(KERN_WARNING
- "hpet: compare register read back failed.\n");
- }
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+ return res < 8 ? -ETIME : 0;
}
static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -440,9 +431,9 @@ static int hpet_legacy_next_event(unsigned long delta,
static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
static struct hpet_dev *hpet_devs;
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
/* unmask it */
@@ -451,10 +442,10 @@ void hpet_msi_unmask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask(struct irq_data *data)
{
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
- struct hpet_dev *hdev = get_irq_data(irq);
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +453,14 @@ void hpet_msi_mask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
}
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
msg->address_hi = 0;
@@ -726,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_ONLINE:
- INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
+ INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
init_completion(&work.complete);
/* FIXME: add schedule_work_on() */
schedule_delayed_work_on(cpu, &work.work, 0);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0..58bb239a2fd 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
*/
if (!HAVE_HWFP) {
+ /*
+ * Disable xsave as we do not support it if i387
+ * emulation is enabled.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
xstate_size = sizeof(struct i387_soft_struct);
return;
}
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
else
xstate_size = sizeof(struct i387_fsave_struct);
-#endif
}
-#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
void __cpuinit fpu_init(void)
{
- unsigned long oldcr0 = read_cr0();
-
- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+ if (cpu_has_fxsr)
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (cpu_has_xmm)
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ set_in_cr4(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!HAVE_HWFP)
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
if (!smp_processor_id())
init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
clear_used_math();
}
-#else /* CONFIG_X86_64 */
-
-void __cpuinit fpu_init(void)
-{
- if (!smp_processor_id())
- init_thread_xstate();
-}
-
-#endif /* CONFIG_X86_32 */
-
void fpu_finit(struct fpu *fpu)
{
-#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
finit_soft_fpu(&fpu->state->soft);
return;
}
-#endif
if (cpu_has_fxsr) {
struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
#ifdef CONFIG_X86_64
env->fip = fxsave->rip;
env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+ savesegment(ds, env->fos);
} else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
+ env->fos = tsk->thread.ds;
}
+ env->fos |= 0xffff0000;
#else
env->fip = fxsave->fip;
env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac9..20757cb2efa 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -29,24 +29,10 @@
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
*/
+static void init_8259A(int auto_eoi);
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-static void mask_8259A(void);
-static void unmask_8259A(void);
-static void disable_8259A_irq(unsigned int irq);
-static void enable_8259A_irq(unsigned int irq);
-static void init_8259A(int auto_eoi);
-static int i8259A_irq_pending(unsigned int irq);
-
-struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
/*
* 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-static void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
+}
+
static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
@@ -117,7 +113,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
+ i8259A_chip.name);
enable_irq(irq);
}
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!
*/
-static void mask_and_ack_8259A(unsigned int irq)
+static void mask_and_ack_8259A(struct irq_data *data)
{
+ unsigned int irq = data->irq;
unsigned int irqmask = 1 << irq;
unsigned long flags;
@@ -223,6 +220,14 @@ spurious_8259A_irq:
}
}
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
static char irq_trigger[2];
/**
* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi)
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
- i8259A_chip.mask_ack = disable_8259A_irq;
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
@@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi)
static void legacy_pic_noop(void) { };
static void legacy_pic_uint_noop(unsigned int unused) { };
static void legacy_pic_int_noop(int unused) { };
-
-static struct irq_chip dummy_pic_chip = {
- .name = "dummy pic",
- .mask = legacy_pic_uint_noop,
- .unmask = legacy_pic_uint_noop,
- .disable = legacy_pic_uint_noop,
- .mask_ack = legacy_pic_uint_noop,
-};
static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
@@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
- .chip = &dummy_pic_chip,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
@@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = {
struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
- .mask_all = mask_8259A,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
.irq_pending = i8259A_irq_pending,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18..83ec0175f98 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
+ seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_printf(p, " IRQ work interrupts\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
seq_printf(p, "-%-8s", desc->name);
if (action) {
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
@@ -282,6 +282,7 @@ void fixup_irqs(void)
unsigned int irq, vector;
static int warned;
struct irq_desc *desc;
+ struct irq_data *data;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -296,7 +297,8 @@ void fixup_irqs(void)
/* interrupt's are disabled at this point */
raw_spin_lock(&desc->lock);
- affinity = desc->affinity;
+ data = &desc->irq_data;
+ affinity = data->affinity;
if (!irq_has_action(irq) ||
cpumask_equal(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
@@ -315,16 +317,16 @@ void fixup_irqs(void)
affinity = cpu_all_mask;
}
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
- desc->chip->mask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
+ data->chip->irq_mask(data);
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
+ if (data->chip->irq_set_affinity)
+ data->chip->irq_set_affinity(data, affinity, true);
else if (!(warned++))
set_affinity = 0;
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
- desc->chip->unmask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
+ data->chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
@@ -355,10 +357,10 @@ void fixup_irqs(void)
if (irr & (1 << (vector % 32))) {
irq = __get_cpu_var(vector_irq)[vector];
- desc = irq_to_desc(irq);
+ data = irq_get_irq_data(irq);
raw_spin_lock(&desc->lock);
- if (desc->chip->retrigger)
- desc->chip->retrigger(irq);
+ if (data->chip->irq_retrigger)
+ data->chip->irq_retrigger(data);
raw_spin_unlock(&desc->lock);
}
}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d16..96656f20775 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -17,6 +17,7 @@
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/percpu.h>
+#include <linux/mm.h>
#include <asm/apic.h>
@@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-#ifdef CONFIG_4KSTACKS
/*
* per-CPU IRQ handling contexts (thread information and stack)
*/
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(PAGE_SIZE)));
+} __attribute__((aligned(THREAD_SIZE)));
static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
-
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
@@ -129,7 +126,9 @@ void __cpuinit irq_ctx_init(int cpu)
if (per_cpu(hardirq_ctx, cpu))
return;
- irqctx = &per_cpu(hardirq_stack, cpu);
+ irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREAD_FLAGS,
+ THREAD_ORDER));
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
@@ -138,7 +137,9 @@ void __cpuinit irq_ctx_init(int cpu)
per_cpu(hardirq_ctx, cpu) = irqctx;
- irqctx = &per_cpu(softirq_stack, cpu);
+ irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREAD_FLAGS,
+ THREAD_ORDER));
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
@@ -151,11 +152,6 @@ void __cpuinit irq_ctx_init(int cpu)
cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
-void irq_ctx_exit(int cpu)
-{
- per_cpu(hardirq_ctx, cpu) = NULL;
-}
-
asmlinkage void do_softirq(void)
{
unsigned long flags;
@@ -187,11 +183,6 @@ asmlinkage void do_softirq(void)
local_irq_restore(flags);
}
-#else
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
-#endif
-
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
struct irq_desc *desc;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 00000000000..ca8f703a1e7
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (!cpu_has_apic)
+ return;
+
+ apic->send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+#endif
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 990ae7cfc57..c752e973958 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
void __init init_ISA_irqs(void)
{
+ struct irq_chip *chip = legacy_pic->chip;
+ const char *name = chip->name;
int i;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +109,8 @@ void __init init_ISA_irqs(void)
#endif
legacy_pic->init(0);
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
}
void __init init_IRQ(void)
@@ -224,9 +215,9 @@ static void __init apic_intr_init(void)
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+ /* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+ alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
# endif
#endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 00000000000..961b6b30ba9
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+ char code[JUMP_LABEL_NOP_SIZE];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ union jump_code_union code;
+
+ if (type == JUMP_LABEL_ENABLE) {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ } else
+ memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+ text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+}
+
+#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f1..90fcf62854b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
.open = setup_data_open,
+ .llseek = default_llseek,
};
static int __init
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 852b81967a3..ec592caac4b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -387,7 +387,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
* disable hardware debugging while it is processing gdb packets or
* handling exception.
*/
-void kgdb_disable_hw_debug(struct pt_regs *regs)
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
{
int i;
int cpu = raw_smp_processor_id();
@@ -477,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
raw_smp_processor_id());
}
- kgdb_correct_hw_break();
-
return 0;
}
@@ -621,7 +619,12 @@ int kgdb_arch_init(void)
static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
struct perf_sample_data *data, struct pt_regs *regs)
{
- kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
+ struct task_struct *tsk = current;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (breakinfo[i].enabled)
+ tsk->thread.debugreg6 |= (DR_TRAP0 << i);
}
void kgdb_arch_late(void)
@@ -644,7 +647,7 @@ void kgdb_arch_late(void)
if (breakinfo[i].pev)
continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
- if (IS_ERR(breakinfo[i].pev)) {
+ if (IS_ERR((void * __force)breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
@@ -721,6 +724,7 @@ struct kgdb_arch arch_kgdb_ops = {
.flags = KGDB_HW_BREAKPOINT,
.set_hw_breakpoint = kgdb_set_hw_break,
.remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
.remove_all_hw_break = kgdb_remove_all_hw_break,
.correct_hw_break = kgdb_correct_hw_break,
};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e..1cbd54c0df9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
return 0;
}
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
/* Check if paddr is at an instruction boundary */
static int __kprobes can_probe(unsigned long paddr)
{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
return 0;
/* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
*(unsigned long *)addr = val;
}
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
{
asm volatile (
".global optprobe_template_entry\n"
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether the address range is reserved */
if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1))
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
return -EBUSY;
return len;
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
unsigned long addr, size = 0, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- /* Dummy buffers for lookup_symbol_attrs */
- static char __dummy_buf[KSYM_NAME_LEN];
/* Lookup symbol including addr */
- if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c..ca43ce31a19 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
- int low, high;
+ int low, high, ret;
+
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+ ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
- return native_write_msr_safe(msr_kvm_system_time, low, high);
+ return ret;
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c52918..b3ea9db39db 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pud = (pud_t *)page_address(page);
- memset(pud, 0, PAGE_SIZE);
+ clear_page(pud);
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pmd = (pmd_t *)page_address(page);
- memset(pmd, 0, PAGE_SIZE);
+ clear_page(pmd);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7..ce0cb4721c9 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -212,7 +212,7 @@ static int install_equiv_cpu_table(const u8 *buf)
return 0;
}
- equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
+ equiv_cpu_table = vmalloc(size);
if (!equiv_cpu_table) {
pr_err("failed to allocate equivalent CPU table\n");
return 0;
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c1..1cca374a2ba 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
* Software Developer's Manual
* Order Number 253668 or free download from:
*
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
+ .llseek = no_llseek,
};
static struct miscdevice microcode_dev = {
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a9..dcb65cc0a05 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
* Software Developer's Manual
* Order Number 253668 or free download from:
*
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd4..6da143c2a6b 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -217,13 +217,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
wrmsrl(address, val);
}
-static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
{
pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
return 0;
}
-static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
@@ -234,7 +234,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{}
};
-void __cpuinit check_enable_amd_mmconf_dmi(void)
+/* Called from a __cpuinit function, but only on the BSP. */
+void __ref check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c55096..8f295609173 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr,
apply_paravirt(pseg, pseg + para->sh_size);
}
+ /* make jump label nops */
+ jump_label_apply_nops(me);
+
return 0;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d7b6f7fb4fe..9af64d9c4b6 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
@@ -657,7 +658,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
- reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
+ memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -686,7 +687,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
+ memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
if (mpf->physptr)
smp_reserve_memory(mpf);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c0..c5b250011fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
.alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d..f56a117cef6 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
#include <asm/rio.h>
#include <asm/bios_ebda.h>
#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
return 0;
}
-void __init detect_calgary(void)
+int __init detect_calgary(void)
{
int bus;
void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
* another HW IOMMU already, bail out.
*/
if (no_iommu || iommu_detected)
- return;
+ return -ENODEV;
if (!use_calgary)
- return;
+ return -ENODEV;
if (!early_pci_allowed())
- return;
+ return -ENODEV;
printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
if (!rio_table_hdr) {
printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
"in EBDA - bailing!\n");
- return;
+ return -ENODEV;
}
ret = build_detail_arrays();
if (ret) {
printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return;
+ return -ENOMEM;
}
specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
x86_init.iommu.iommu_init = calgary_iommu_init;
}
- return;
+ return calgary_found;
cleanup:
for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
if (info->tce_space)
free_tce_table(info->tce_space);
}
+ return -ENOMEM;
}
static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
* and before device_initcall.
*/
rootfs_initcall(calgary_fixup_tce_spaces);
+
+IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9f07cfcbd3a..9ea999a4dcc 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,9 +11,8 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/calgary.h>
-#include <asm/amd_iommu.h>
#include <asm/x86_init.h>
-#include <asm/xen/swiotlb-xen.h>
+#include <asm/iommu_table.h>
static int forbid_dac __read_mostly;
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
+extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+
/* Dummy device used for NULL arguments (normally ISA). */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
@@ -130,26 +131,24 @@ static void __init dma32_free_bootmem(void)
void __init pci_iommu_alloc(void)
{
+ struct iommu_table_entry *p;
+
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
- if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
- goto out;
-
- gart_iommu_hole_init();
-
- detect_calgary();
-
- detect_intel_iommu();
+ sort_iommu_table(__iommu_table, __iommu_table_end);
+ check_iommu_entries(__iommu_table, __iommu_table_end);
- /* needs to be called after gart_iommu_hole_init */
- amd_iommu_detect();
-out:
- pci_xen_swiotlb_init();
-
- pci_swiotlb_init();
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && p->detect && p->detect() > 0) {
+ p->flags |= IOMMU_DETECTED;
+ if (p->early_init)
+ p->early_init();
+ if (p->flags & IOMMU_FINISH_IF_DETECTED)
+ break;
+ }
+ }
}
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@@ -292,6 +291,7 @@ EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
+ struct iommu_table_entry *p;
dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
#ifdef CONFIG_PCI
@@ -299,12 +299,10 @@ static int __init pci_iommu_init(void)
#endif
x86_init.iommu.iommu_init();
- if (swiotlb || xen_swiotlb) {
- printk(KERN_INFO "PCI-DMA: "
- "Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_print_info();
- } else
- swiotlb_free();
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
+ p->late_init();
+ }
return 0;
}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa6..ba0f0ca9f28 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,8 +39,9 @@
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -560,8 +561,11 @@ static void enable_gart_translations(void)
{
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
enable_gart_translation(dev, __pa(agp_gatt_table));
}
@@ -592,16 +596,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
if (!fix_up_north_bridges)
return;
+ if (!k8_northbridges.gart_supported)
+ return;
+
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
/*
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
@@ -649,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < num_k8_northbridges; i++) {
- dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ dev = k8_northbridges.nb_misc[i];
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -718,10 +725,13 @@ static void gart_iommu_shutdown(void)
if (!no_agp)
return;
- for (i = 0; i < num_k8_northbridges; i++) {
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 ctl;
- dev = k8_northbridges[i];
+ dev = k8_northbridges.nb_misc[i];
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -739,7 +749,7 @@ int __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (num_k8_northbridges == 0)
+ if (!k8_northbridges.gart_supported)
return 0;
#ifndef CONFIG_AGP_AMD64
@@ -896,3 +906,4 @@ void __init gart_parse_options(char *p)
}
}
}
+IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 00000000000..55d745ec118
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,89 @@
+#include <linux/dma-mapping.h>
+#include <asm/iommu_table.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
+
+
+#define DEBUG 1
+
+/*
+ * Scan [start, finish) and return the first entry whose ->detect equals
+ * q->depend.  Returns NULL when q is NULL or when no entry matches.
+ */
+static struct iommu_table_entry * __init
+find_dependents_of(struct iommu_table_entry *start,
+		   struct iommu_table_entry *finish,
+		   struct iommu_table_entry *q)
+{
+	struct iommu_table_entry *cur = start;
+
+	if (q == NULL)
+		return NULL;
+
+	while (cur < finish) {
+		if (cur->detect == q->depend)
+			return cur;
+		cur++;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Reorder the table in place so that, for every slot, the entry it
+ * depends on (as found by find_dependents_of()) does not sit at a higher
+ * address than the slot itself.
+ *
+ * NOTE(review): two entries that depend on each other look like they
+ * would be swapped back and forth here without end - confirm that
+ * callers cannot pass such a table.
+ */
+void __init sort_iommu_table(struct iommu_table_entry *start,
+			     struct iommu_table_entry *finish)
+{
+	struct iommu_table_entry *slot, *dep, saved;
+
+	for (slot = start; slot < finish; slot++) {
+		/*
+		 * Table position is judged by memory address: as long as
+		 * the entry this slot depends on lives past the slot, swap
+		 * the two and look again at whatever now occupies the slot.
+		 */
+		while ((dep = find_dependents_of(start, finish, slot)) > slot) {
+			saved = *slot;
+			memmove(slot, dep, sizeof(*slot));
+			*dep = saved;
+		}
+	}
+}
+
+#ifdef DEBUG
+/*
+ * Sanity-check the table in [start, finish) and report problems with
+ * printk(KERN_ERR ...).
+ *
+ * First pass: for each entry, follow ->depend one step and then one
+ * more; if that leads back to the entry, log the pair and clear the
+ * entry's ->depend.
+ *
+ * Second pass: for each entry, look only at the entries from itself
+ * onwards; if the one it depends on is found after it, log that the
+ * execution order is invalid.
+ */
+void __init check_iommu_entries(struct iommu_table_entry *start,
+				struct iommu_table_entry *finish)
+{
+	struct iommu_table_entry *cur, *dep, *back;
+	char name_cur[KSYM_SYMBOL_LEN];
+	char name_dep[KSYM_SYMBOL_LEN];
+
+	/* Simple cyclic dependency checker. */
+	for (cur = start; cur < finish; cur++) {
+		dep = find_dependents_of(start, finish, cur);
+		back = find_dependents_of(start, finish, dep);
+		if (back != cur)
+			continue;
+
+		sprint_symbol(name_cur, (unsigned long)cur->detect);
+		sprint_symbol(name_dep, (unsigned long)dep->detect);
+
+		printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
+		       " on %s and vice-versa. BREAKING IT.\n",
+		       name_cur, name_dep);
+		/* Heavy-handed: just drop the dependency. */
+		back->depend = 0;
+	}
+
+	for (cur = start; cur < finish; cur++) {
+		dep = find_dependents_of(cur, finish, cur);
+		if (!dep || dep <= cur)
+			continue;
+
+		sprint_symbol(name_cur, (unsigned long)cur->detect);
+		sprint_symbol(name_dep, (unsigned long)dep->detect);
+
+		printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
+		       "should be called before %s!\n",
+		       name_cur, name_dep);
+	}
+}
+#else
+/* Used when DEBUG is not defined: does nothing. */
+inline void check_iommu_entries(struct iommu_table_entry *start,
+				struct iommu_table_entry *finish)
+{
+}
+#endif
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d432..8f972cbddef 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
-
+#include <asm/xen/swiotlb-xen.h>
+#include <asm/iommu_table.h>
int swiotlb __read_mostly;
static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
};
/*
- * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
*
* This returns non-zero if we are forced to use swiotlb (by the boot
* option).
*/
-int __init pci_swiotlb_detect(void)
+int __init pci_swiotlb_detect_override(void)
{
int use_swiotlb = swiotlb | swiotlb_force;
+ if (swiotlb_force)
+ swiotlb = 1;
+
+ return use_swiotlb;
+}
+IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
+ pci_xen_swiotlb_detect,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+/*
+ * If 4GB or more of memory is detected (and iommu=off is not set),
+ * set swiotlb to 1 and return 1.
+ */
+int __init pci_swiotlb_detect_4gb(void)
+{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
- if (swiotlb_force)
- swiotlb = 1;
-
- return use_swiotlb;
+ return swiotlb;
}
+IOMMU_INIT(pci_swiotlb_detect_4gb,
+ pci_swiotlb_detect_override,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
void __init pci_swiotlb_init(void)
{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
dma_ops = &swiotlb_dma_ops;
}
}
+
+/*
+ * Late-init hook for swiotlb: when swiotlb is still set, announce it
+ * and print its info; otherwise release it with swiotlb_free().
+ */
+void __init pci_swiotlb_late_init(void)
+{
+	if (swiotlb) {
+		printk(KERN_INFO "PCI-DMA: "
+		       "Using software bounce buffering for IO (SWIOTLB)\n");
+		swiotlb_print_info();
+		return;
+	}
+
+	/* An IOMMU turned us off. */
+	swiotlb_free();
+}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f199..00000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
- u32 a, b;
- for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
- a == b;
- b = inl(pmtmr_ioport) & ACPI_PM_MASK)
- cpu_relax();
- return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
- u32 a, b;
- a = pmtimer_wait_tick();
- do {
- b = inl(pmtmr_ioport);
- cpu_relax();
- } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
- pmtmr_ioport = 0;
- return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd..b3d7a3a04f3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);
/* Must be after DS reload */
- unlazy_fpu(prev_p);
+ __unlazy_fpu(prev_p);
/* Make sure cpu is ready for new context */
if (preload_fpu)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8a..45892dc4b72 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child)
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
{
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
unsigned long tmp;
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
tmp = 0; /* Default return condition */
@@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case PTRACE_GET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_get_thread_area(child, addr,
- (struct user_desc __user *) data);
+ (struct user_desc __user *)data);
break;
case PTRACE_SET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_set_thread_area(child, addr,
- (struct user_desc __user *) data, 0);
+ (struct user_desc __user *)data, 0);
break;
#endif
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02a..008b91eefa1 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,48 +41,11 @@ void pvclock_set_flags(u8 flags)
valid_flags = flags;
}
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif defined(__x86_64__)
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+ return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
+ shadow->tsc_shift);
}
/*
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245..8bbe8c56916 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
vt8237_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
static void ati_force_hpet_resume(void)
{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83..c495aa8d481 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
}
/* we will leave sorting out the final value
when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
+ have detected BSP APIC ID or smp_num_cpu */
break;
#endif /* CONFIG_SMP */
@@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length)
CMOS_WRITE(0x00, 0x8f);
spin_unlock(&rtc_lock);
- /* Remap the kernel at virtual address zero, as well as offset zero
- from the kernel segment. This assumes the kernel segment starts at
- virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
-
/*
- * Use `swapper_pg_dir' as our page directory.
+ * Switch back to the initial page table.
*/
- load_cr3(swapper_pg_dir);
+ load_cr3(initial_page_table);
/* Write 0x1234 to absolute memory location 0x472. The BIOS reads
this on booting to tell it to "Bypass memory test (also warm
@@ -641,7 +635,7 @@ void native_machine_shutdown(void)
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
*/
- smp_send_stop();
+ stop_other_cpus();
#endif
lapic_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb2b99..21c6746338a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
#include <linux/apm_bios.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/console.h>
#include <linux/mca.h>
@@ -83,7 +84,6 @@
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
-#include <asm/vmi.h>
#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
@@ -107,11 +107,12 @@
#include <asm/percpu.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#ifdef CONFIG_X86_64
#include <asm/numa_64.h>
#endif
#include <asm/mce.h>
+#include <asm/alternative.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -125,7 +126,6 @@ unsigned long max_pfn_mapped;
RESERVE_BRK(dmi_alloc, 65536);
#endif
-unsigned int boot_cpu_id __read_mostly;
static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
@@ -302,7 +302,7 @@ static inline void init_gbpages(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -324,17 +324,16 @@ static void __init relocate_initrd(void)
char *p, *q;
/* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
+ ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
PAGE_SIZE);
- if (ramdisk_here == -1ULL)
+ if (ramdisk_here == MEMBLOCK_ERROR)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + area_size,
- "NEW RAMDISK");
+ memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -390,7 +389,7 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- free_early(ramdisk_image, ramdisk_end);
+ memblock_x86_free_range(ramdisk_image, ramdisk_end);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
@@ -413,7 +412,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- free_early(ramdisk_image, ramdisk_end);
+ memblock_x86_free_range(ramdisk_image, ramdisk_end);
}
#else
static void __init reserve_initrd(void)
@@ -469,7 +468,7 @@ static void __init e820_reserve_setup_data(void)
e820_print_map("reserve setup_data");
}
-static void __init reserve_early_setup_data(void)
+static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_data *data;
u64 pa_data;
@@ -481,7 +480,7 @@ static void __init reserve_early_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
pa_data = data->next;
early_iounmap(data, sizeof(*data));
}
@@ -502,6 +501,7 @@ static inline unsigned long long get_total_mem(void)
return total << PAGE_SHIFT;
}
+#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
static void __init reserve_crashkernel(void)
{
unsigned long long total_mem;
@@ -519,23 +519,27 @@ static void __init reserve_crashkernel(void)
if (crash_base <= 0) {
const unsigned long long alignment = 16<<20; /* 16M */
- crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
- alignment);
- if (crash_base == -1ULL) {
+ /*
+ * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
+ */
+ crash_base = memblock_find_in_range(alignment,
+ DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
+
+ if (crash_base == MEMBLOCK_ERROR) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
} else {
unsigned long long start;
- start = find_e820_area(crash_base, ULONG_MAX, crash_size,
- 1<<20);
+ start = memblock_find_in_range(crash_base,
+ crash_base + crash_size, crash_size, 1<<20);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
- reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
+ memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -615,82 +619,10 @@ static __init void reserve_ibft_region(void)
addr = find_ibft_region(&size);
if (size)
- reserve_early_overlap_ok(addr, addr + size, "ibft");
+ memblock_x86_reserve_range(addr, addr + size, "* ibft");
}
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working around it.\n",
- d->ident);
-
- e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
- return 0;
-}
-#endif
-
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix/MSC BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
- },
- },
- /*
- * AMI BIOS with low memory corruption was found on Intel DG45ID and
- * DG45FC boards.
- * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
- * match only DMI_BOARD_NAME and see if there is more bad products
- * with this vendor.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
- },
- },
- /*
- * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
- * match on the product name.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
- },
- },
-#endif
- {}
-};
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
static void __init trim_bios_range(void)
{
@@ -698,8 +630,14 @@ static void __init trim_bios_range(void)
* A special case is the first 4Kb of memory;
* This is a BIOS owned area, not kernel ram, but generally
* not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+ E820_RAM, E820_RESERVED);
+
/*
* special case: Some BIOSen report the PC BIOS
* area (640->1Mb) as ram even though it is not.
@@ -709,6 +647,37 @@ static void __init trim_bios_range(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+static int __init parse_reservelow(char *p)
+{
+ unsigned long long size;
+
+ if (!p)
+ return -EINVAL;
+
+ size = memparse(p, &p);
+
+ if (size < 4096)
+ size = 4096;
+
+ if (size > 640*1024)
+ size = 640*1024;
+
+ reserve_low = size;
+
+ return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
+static u64 __init get_max_mapped(void)
+{
+ u64 end = max_pfn_mapped;
+
+ end <<= PAGE_SHIFT;
+
+ return end;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -726,18 +695,30 @@ void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
int k8 = 0;
+ unsigned long flags;
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
+
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
-
- /* OFW also may relocate the fixmap */
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
olpc_ofw_detect();
early_trap_init();
@@ -782,12 +763,14 @@ void __init setup_arch(char **cmdline_p)
#endif
4)) {
efi_enabled = 1;
- efi_reserve_early();
+ efi_memblock_x86_reserve_range();
}
#endif
x86_init.oem.arch_setup();
+ resource_alloc_from_bottom = 0;
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
/* update the e820_saved too */
@@ -838,11 +821,8 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
-
/* after early param, so could get panic from serial */
- reserve_early_setup_data();
+ memblock_x86_reserve_range_setup_data();
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
@@ -863,8 +843,6 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
- dmi_check_system(bad_bios_dmi_table);
-
/*
* VMware detection requires dmi to be available, so this
* needs to be done after dmi_scan_machine, for the BP.
@@ -897,8 +875,6 @@ void __init setup_arch(char **cmdline_p)
*/
max_pfn = e820_end_of_ram_pfn();
- /* preallocate 4k for mptable mpc */
- early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
mtrr_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
@@ -920,18 +896,8 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
- setup_bios_corruption_check();
-#endif
-
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
-
- reserve_brk();
-
/*
* Find and reserve possible boot-time SMP configuration:
*/
@@ -939,6 +905,26 @@ void __init setup_arch(char **cmdline_p)
reserve_ibft_region();
+ /*
+ * Need to conclude brk, before memblock_x86_fill()
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
+ */
+ reserve_brk();
+
+ memblock.current_limit = get_max_mapped();
+ memblock_x86_fill();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
+ printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+
reserve_trampoline_memory();
#ifdef CONFIG_ACPI_SLEEP
@@ -962,6 +948,7 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
}
#endif
+ memblock.current_limit = get_max_mapped();
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -1000,10 +987,7 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
-#ifndef CONFIG_NO_BOOTMEM
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-#endif
-
+ memblock_find_dma_reserve();
dma32_reserve_bootmem();
#ifdef CONFIG_KVM_CLOCK
@@ -1014,7 +998,12 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);
- setup_trampoline_page_table();
+#ifdef CONFIG_X86_32
+ /* sync back kernel address range */
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+#endif
tboot_probe();
@@ -1071,6 +1060,10 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.banner();
mcheck_init();
+
+ local_irq_save(flags);
+ arch_init_ideal_nop5();
+ local_irq_restore(flags);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae645..002b79685f7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
-#ifdef CONFIG_NO_BOOTMEM
- u64 start = __pa(ptr);
- u64 end = start + size;
- free_early_partial(start, end);
-#else
free_bootmem(__pa(ptr), size);
-#endif
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -253,7 +247,7 @@ void __init setup_per_cpu_areas(void)
* Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
- if (cpu == boot_cpu_id)
+ if (!cpu)
switch_to_new_gdt(cpu);
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d801210945d..513deac7228 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void)
irq_exit();
}
-static void native_smp_send_stop(void)
+static void native_stop_other_cpus(int wait)
{
unsigned long flags;
- unsigned long wait;
+ unsigned long timeout;
if (reboot_force)
return;
@@ -179,9 +179,12 @@ static void native_smp_send_stop(void)
if (num_online_cpus() > 1) {
apic->send_IPI_allbutself(REBOOT_VECTOR);
- /* Don't wait longer than a second */
- wait = USEC_PER_SEC;
- while (num_online_cpus() > 1 && wait--)
+ /*
+ * Don't wait longer than a second if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
@@ -227,7 +230,7 @@ struct smp_ops smp_ops = {
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
+ .stop_other_cpus = native_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd70..083e99d1b7d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
@@ -299,23 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused)
* fragile that we want to limit the things done here to the
* most necessary things.
*/
+ cpu_init();
+ preempt_disable();
+ smp_callin();
#ifdef CONFIG_X86_32
- /*
- * Switch away from the trampoline page-table
- *
- * Do this before cpu_init() because it needs to access per-cpu
- * data which may not be mapped in the trampoline page-table.
- */
+ /* switch away from the initial page table */
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- vmi_bringup();
- cpu_init();
- preempt_disable();
- smp_callin();
-
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
@@ -324,9 +317,9 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
enable_NMI_through_LVT0();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
/* This must be done before setting cpu_online_mask */
@@ -397,6 +390,19 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+ struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+ struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+ cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+ cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+ cpumask_set_cpu(cpu1, c2->llc_shared_map);
+ cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
void __cpuinit set_cpu_sibling_map(int cpu)
{
@@ -409,14 +415,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
struct cpuinfo_x86 *o = &cpu_data(i);
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- cpumask_set_cpu(i, cpu_sibling_mask(cpu));
- cpumask_set_cpu(cpu, cpu_sibling_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, o->llc_shared_map);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->compute_unit_id == o->compute_unit_id)
+ link_thread_siblings(cpu, i);
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ link_thread_siblings(cpu, i);
}
}
} else {
@@ -742,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
- INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
+ INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
@@ -774,7 +779,6 @@ do_rest:
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- initial_page_table = __pa(&trampoline_pg_dir);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
@@ -923,7 +927,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
err = do_boot_cpu(apicid, cpu);
-
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
@@ -1109,8 +1112,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
- enable_IR_x2apic();
- default_setup_apic_routing();
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1119,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
goto out;
}
+ default_setup_apic_routing();
+
preempt_disable();
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1370,7 +1373,6 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- irq_ctx_exit(raw_smp_processor_id());
c1e_remove_cpu(raw_smp_processor_id());
mb();
@@ -1383,11 +1385,88 @@ void play_dead_common(void)
local_irq_disable();
}
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ int i;
+ void *mwait_ptr;
+
+ if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+ return;
+ if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+ return;
+ if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return;
+
+ eax = CPUID_MWAIT_LEAF;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ /*
+ * This should be a memory location in a cache line which is
+ * unlikely to be touched by other processors. The actual
+ * content is immaterial as it is not actually modified in any way.
+ */
+ mwait_ptr = &current_thread_info()->flags;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ clflush(mwait_ptr);
+ __monitor(mwait_ptr, 0, 0);
+ mb();
+ __mwait(eax, 0);
+ }
+}
+
+static inline void hlt_play_dead(void)
+{
+ if (current_cpu_data.x86 >= 4)
+ wbinvd();
+
+ while (1) {
+ native_halt();
+ }
+}
+
void native_play_dead(void)
{
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- wbinvd_halt();
+
+ mwait_play_dead(); /* Only returns on failure */
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34..0b0cb5fede1 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
const char *const envp[])
{
long __res;
- asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+ asm volatile ("int $0x80"
: "=a" (__res)
- : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
+ : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
return __res;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index e2a59525739..a375616d77f 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,8 +1,8 @@
#include <linux/io.h>
+#include <linux/memblock.h>
#include <asm/trampoline.h>
#include <asm/pgtable.h>
-#include <asm/e820.h>
#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
#define __trampinit
@@ -17,15 +17,15 @@ unsigned char *__trampinitdata trampoline_base;
void __init reserve_trampoline_memory(void)
{
- unsigned long mem;
+ phys_addr_t mem;
/* Has to be in very low memory so we can execute real-mode AP code. */
- mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
- if (mem == -1L)
+ mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
panic("Cannot allocate trampoline\n");
trampoline_base = __va(mem);
- reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
+ memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
}
/*
@@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void)
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
-
-void __init setup_trampoline_page_table(void)
-{
-#ifdef CONFIG_X86_32
- /* Copy kernel address range */
- clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
-
- /* Initialize low mappings */
- clone_pgd_range(trampoline_pg_dir,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS,
- KERNEL_PGD_BOUNDARY));
-#endif
-}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8..cb838ca42c9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (regs->flags & X86_VM_MASK) {
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
+ preempt_conditional_cli(regs);
return;
}
@@ -776,21 +777,10 @@ asmlinkage void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
@@ -798,12 +788,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+ return;
}
-#else
- math_state_restore();
+#endif
+ math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+ conditional_sti(regs);
#endif
}
@@ -881,18 +871,6 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
- }
-
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a..0c40d8b7241 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int no_sched_irq_time;
+
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
return 1;
}
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
if (!tsc_unstable) {
tsc_unstable = 1;
sched_clock_stable = 0;
+ disable_sched_clock_irqtime();
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-
void __init tsc_init(void)
{
u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
return;
}
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calibrate_cpu();
-
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f79..61fb9851962 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -551,8 +551,14 @@ cannot_handle:
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
- if ((trapno == 3) || (trapno == 1))
- return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+ if ((trapno == 3) || (trapno == 1)) {
+ KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
+ /* setting this flag forces the code in entry_32.S to
+ call save_v86_state() and change the stack pointer
+ to KVM86->regs32 */
+ set_thread_flag(TIF_IRET);
+ return 0;
+ }
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
return 0;
}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb752..00000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend, doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run... anyways, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub. Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- /*
- * Prevent page tables from being allocated in highmem, even if
- * CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi [deprecated]";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
- x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
- x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- x86_platform.calibrate_tsc = vmi_tsc_khz;
- x86_platform.get_wallclock = vmi_get_wallclock;
- x86_platform.set_wallclock = vmi_set_wallclock;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, basil */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd7..00000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
- return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, set_next_event interface only passes relative
- * expiry, but we want absolute expiry. It'd be better if were
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32-bits. Choosing
- * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * as clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa..e03530aebfd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -242,6 +242,12 @@ SECTIONS
__x86_cpu_dev_end = .;
}
+ /*
+ * start address and size of operations which during runtime
+ * can be patched with virtualization friendly instructions or
+ * baremetal native ones. Think page table operations.
+ * Details in paravirt_types.h
+ */
. = ALIGN(8);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
__parainstructions = .;
@@ -249,6 +255,11 @@ SECTIONS
__parainstructions_end = .;
}
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
. = ALIGN(8);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
__alt_instructions = .;
@@ -256,11 +267,28 @@ SECTIONS
__alt_instructions_end = .;
}
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ */
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
/*
+ * struct iommu_table_entry entries are injected in this section.
+ * It is an array of IOMMUs which during run time gets sorted depending
+ * on its dependency order. After rootfs_initcall is complete
+ * this section can be safely removed.
+ */
+ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
+ __iommu_table = .;
+ *(.iommu_table)
+ __iommu_table_end = .;
+ }
+ . = ALIGN(8);
+ /*
* .exit.text is discard at runtime, not link time, to deal with
* references from .altinstructions and .eh_frame
*/
@@ -273,7 +301,7 @@ SECTIONS
}
#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU(PAGE_SIZE)
+ PERCPU(THREAD_SIZE)
#endif
. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3ec..ceb2911aa43 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -6,10 +6,12 @@
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/module.h>
+#include <linux/pci.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
#include <asm/pci_x86.h>
+#include <asm/pci.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
#include <asm/apic.h>
@@ -99,3 +101,8 @@ struct x86_platform_ops x86_platform = {
};
EXPORT_SYMBOL_GPL(x86_platform);
+struct x86_msi_ops x86_msi = {
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd47951..ddc131ff438 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds a R/W KVM module parameter 'mmu_audit', which allows
+ auditing of the KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd..38b6e8dafaf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
@@ -51,13 +51,13 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstAcc (4<<1) /* Destination Accumulator */
#define DstDI (5<<1) /* Destination is in ES:(E)DI */
#define DstMem64 (6<<1) /* 64bit memory operand */
+#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
#define DstMask (7<<1)
/* Source operand type. */
#define SrcNone (0<<4) /* No source operand. */
-#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
#define SrcReg (1<<4) /* Register operand. */
#define SrcMem (2<<4) /* Memory operand. */
#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -71,6 +71,7 @@
#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
#define SrcAcc (0xd<<4) /* Source Accumulator */
+#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -82,8 +83,10 @@
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
-#define GroupMask 0xff /* Group number stored in bits 0:7 */
/* Misc flags */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
@@ -92,285 +95,30 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
+#define Src2Imm (4<<29)
#define Src2Mask (7<<29)
-enum {
- Group1_80, Group1_81, Group1_82, Group1_83,
- Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
- Group8, Group9,
+#define X2(x...) x, x /* Xn(x...) expands to n comma-separated copies of x */
+#define X3(x...) X2(x), x /* 2 + 1 copies */
+#define X4(x...) X2(x), X2(x) /* 2 + 2 copies */
+#define X5(x...) X4(x), x /* 4 + 1 copies */
+#define X6(x...) X4(x), X2(x) /* 4 + 2 copies */
+#define X7(x...) X4(x), X3(x) /* 4 + 3 copies */
+#define X8(x...) X4(x), X4(x) /* 4 + 4 copies */
+#define X16(x...) X8(x), X8(x) /* 8 + 8 copies */
+
+struct opcode { /* One table entry: a flags word plus exactly one of three alternative pointers */
+ u32 flags; /* NOTE(review): presumably the decode flag bits #defined earlier in this file (ByteOp, DstMem, ...) -- confirm at the users */
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt); /* callback taking the emulation context and returning int */
+ struct opcode *group; /* pointer to further opcode entries; presumably used with the Group flag -- confirm */
+ struct group_dual *gdual; /* pointer to a struct group_dual; presumably used with the GroupDual flag -- confirm */
+ } u;
};
-static u32 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- 0, 0,
- /* 0x40 - 0x47 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x48 - 0x4F */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x50 - 0x57 */
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- /* 0x58 - 0x5F */
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- /* 0x60 - 0x67 */
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
- SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x78 - 0x7F */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x80 - 0x87 */
- Group | Group1_80, Group | Group1_81,
- Group | Group1_82, Group | Group1_83,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
- ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
- /* 0x90 - 0x97 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x98 - 0x9F */
- 0, 0, SrcImmFAddr | No64, 0,
- ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
- ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
- ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
- /* 0xA8 - 0xAF */
- DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
- ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
- ByteOp | DstDI | String, DstDI | String,
- /* 0xB0 - 0xB7 */
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- /* 0xB8 - 0xBF */
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps | Stack, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack,
- ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- /* 0xE8 - 0xEF */
- SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
- /* 0xF8 - 0xFF */
- ImplicitOps, 0, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
-};
-
-static u32 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0,
- 0, ImplicitOps, ImplicitOps | Priv, 0,
- ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
- 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
- ImplicitOps, ImplicitOps | Priv, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM, 0, 0,
- /* 0xA8 - 0xAF */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM,
- ModRM, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0,
- Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
- 0, 0, 0, Group | GroupDual | Group9,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static u32 group_table[] = {
- [Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM,
- [Group1_81*8] =
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM,
- [Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64,
- [Group1_83*8] =
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM,
- [Group1A*8] =
- DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
- [Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group3*8] =
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
- 0, 0, 0, 0, 0, 0,
- [Group5*8] =
- DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
- SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
- SrcMem | ModRM | Stack, 0,
- [Group7*8] =
- 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
- [Group8*8] =
- 0, 0, 0, 0,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
- [Group9*8] =
- 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
-};
-
-static u32 group2_table[] = {
- [Group7*8] =
- SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, 0,
- [Group9*8] =
- 0, 0, 0, 0, 0, 0, 0, 0,
+struct group_dual { /* Two parallel 8-entry opcode tables */
+ struct opcode mod012[8]; /* going by the name, the entries for ModRM.mod 0, 1 or 2 -- confirm at the users */
+ struct opcode mod3[8]; /* going by the name, the entries for ModRM.mod == 3 -- confirm at the users */
};
/* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
#define EFLG_PF (1<<2)
#define EFLG_CF (1<<0)
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+#define EFLG_RESERVED_ONE_MASK 2
+
/*
* Instruction emulation:
* Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
#define ON64(x)
#endif
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "4", "2") \
_op _suffix " %"_x"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
+ : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
"=&r" (_tmp) \
: _y ((_src).val), "i" (EFLAGS_MASK)); \
} while (0)
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
\
switch ((_dst).bytes) { \
case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
break; \
case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
break; \
case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
break; \
} \
} while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
unsigned long _tmp; \
switch ((_dst).bytes) { \
case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
} \
} while (0)
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
+ do { /* Execute "_op _suffix" on the memory operand (_src).val with _rax and _rdx bound to the a and d registers as read-write operands */ \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "1") /* operand 0 = _eflags, 4 = EFLAGS_MASK, 1 = _tmp scratch register */ \
+ _op _suffix " %5; " /* operand 5 = (_src).val */ \
+ _POST_EFLAGS("0", "4", "1") \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+ do { /* Like the plain rax/rdx form, but with a .fixup entry that writes 1 into _ex and resumes after the instruction */ \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "1") /* operand 0 = _eflags, 5 = EFLAGS_MASK, 1 = _tmp scratch register */ \
+ "1: \n\t" /* label 1: the emulated instruction itself */ \
+ _op _suffix " %6; " /* operand 6 = (_src).val */ \
+ "2: \n\t" /* label 2: resume point used by the fixup code */ \
+ _POST_EFLAGS("0", "5", "1") \
+ ".pushsection .fixup,\"ax\" \n\t" \
+ "3: movb $1, %4 \n\t" /* operand 4 = _ex */ \
+ "jmp 2b \n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(1b, 3b) /* presumably maps a fault at label 1 to the fixup at label 3 -- confirm against _ASM_EXTABLE */ \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+/*
+ * Instruction has only one source operand; the destination is implicit
+ * (e.g. mul, div, imul, idiv). Selects the "b", "w", "l" or "q" suffix
+ * from (_src).bytes and expands __emulate_1op_rax_rdx() with it; the
+ * 8-byte case is wrapped in ON64(). Other sizes expand to nothing.
+ */
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
+ case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
+ case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+ } \
+ } while (0)
+
+#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
+ do { /* Select the "b"/"w"/"l"/"q" suffix from (_src).bytes and expand __emulate_1op_rax_rdx_ex() with it, passing _ex through */ \
+ switch((_src).bytes) { \
+ case 1: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "b", _ex); \
+ break; \
+ case 2: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "w", _ex); \
+ break; \
+ case 4: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "l", _ex); \
+ break; \
+ case 8: ON64( /* the 8-byte form is wrapped in ON64() */ \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "q", _ex)); \
+ break; \
+ } \
+ } while (0)
+
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
ctxt->exception = vec;
ctxt->error_code = error;
ctxt->error_code_valid = valid;
- ctxt->restart = false;
}
static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
- int err)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt)
{
- ctxt->cr2 = addr;
- emulate_exception(ctxt, PF_VECTOR, err, true);
+ emulate_exception(ctxt, PF_VECTOR, 0, true);
}
static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
emulate_exception(ctxt, TS_VECTOR, err, true);
}
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{ /* Record DE_VECTOR through emulate_exception() with error code 0 flagged as not valid, then report X86EMUL_PROPAGATE_FAULT to the caller */
+ emulate_exception(ctxt, DE_VECTOR, 0, false);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, u8 *dest)
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- void *ptr,
+ ulong addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu, NULL);
+ rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu, NULL);
+ rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
return rc;
}
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
return (!!rc ^ (condition & 1));
}
+/*
+ * Load op->val from the location op->addr.reg points at, reading it as
+ * a u8, u16, u32 or u64 according to op->bytes (1, 2, 4 or 8).  Any
+ * other width leaves op->val untouched.
+ */
+static void fetch_register_operand(struct operand *op)
+{
+	if (op->bytes == 1)
+		op->val = *(u8 *)op->addr.reg;
+	else if (op->bytes == 2)
+		op->val = *(u16 *)op->addr.reg;
+	else if (op->bytes == 4)
+		op->val = *(u32 *)op->addr.reg;
+	else if (op->bytes == 8)
+		op->val = *(u64 *)op->addr.reg;
+}
+
static void decode_register_operand(struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op,
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->ptr = decode_register(reg, c->regs, highbyte_regs);
- op->val = *(u8 *)op->ptr;
+ op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
op->bytes = 1;
} else {
- op->ptr = decode_register(reg, c->regs, 0);
+ op->addr.reg = decode_register(reg, c->regs, 0);
op->bytes = c->op_bytes;
- switch (op->bytes) {
- case 2:
- op->val = *(u16 *)op->ptr;
- break;
- case 4:
- op->val = *(u32 *)op->ptr;
- break;
- case 8:
- op->val = *(u64 *) op->ptr;
- break;
- }
}
+ fetch_register_operand(op);
op->orig_val = op->val;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
c->modrm_mod |= (c->modrm & 0xc0) >> 6;
c->modrm_reg |= (c->modrm & 0x38) >> 3;
c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_ea = 0;
- c->use_modrm_ea = 1;
+ c->modrm_seg = VCPU_SREG_DS;
if (c->modrm_mod == 3) {
- c->modrm_ptr = decode_register(c->modrm_rm,
+ op->type = OP_REG;
+ op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ op->addr.reg = decode_register(c->modrm_rm,
c->regs, c->d & ByteOp);
- c->modrm_val = *(unsigned long *)c->modrm_ptr;
+ fetch_register_operand(op);
return rc;
}
+ op->type = OP_MEM;
+
if (c->ad_bytes == 2) {
unsigned bx = c->regs[VCPU_REGS_RBX];
unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 6)
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, 2, c->eip);
break;
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, 2, c->eip);
break;
}
switch (c->modrm_rm) {
case 0:
- c->modrm_ea += bx + si;
+ modrm_ea += bx + si;
break;
case 1:
- c->modrm_ea += bx + di;
+ modrm_ea += bx + di;
break;
case 2:
- c->modrm_ea += bp + si;
+ modrm_ea += bp + si;
break;
case 3:
- c->modrm_ea += bp + di;
+ modrm_ea += bp + di;
break;
case 4:
- c->modrm_ea += si;
+ modrm_ea += si;
break;
case 5:
- c->modrm_ea += di;
+ modrm_ea += di;
break;
case 6:
if (c->modrm_mod != 0)
- c->modrm_ea += bp;
+ modrm_ea += bp;
break;
case 7:
- c->modrm_ea += bx;
+ modrm_ea += bx;
break;
}
if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
(c->modrm_rm == 6 && c->modrm_mod != 0))
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_SS);
- c->modrm_ea = (u16)c->modrm_ea;
+ c->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
scale = sib >> 6;
if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
else
- c->modrm_ea += c->regs[base_reg];
+ modrm_ea += c->regs[base_reg];
if (index_reg != 4)
- c->modrm_ea += c->regs[index_reg] << scale;
+ modrm_ea += c->regs[index_reg] << scale;
} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
c->rip_relative = 1;
} else
- c->modrm_ea += c->regs[c->modrm_rm];
+ modrm_ea += c->regs[c->modrm_rm];
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 5)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
break;
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
break;
}
}
+ op->addr.mem = modrm_ea;
done:
return rc;
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
+ op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- c->modrm_ea = insn_fetch(u16, 2, c->eip);
+ op->addr.mem = insn_fetch(u16, 2, c->eip);
break;
case 4:
- c->modrm_ea = insn_fetch(u32, 4, c->eip);
+ op->addr.mem = insn_fetch(u32, 4, c->eip);
break;
case 8:
- c->modrm_ea = insn_fetch(u64, 8, c->eip);
+ op->addr.mem = insn_fetch(u64, 8, c->eip);
break;
}
done:
return rc;
}
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static void fetch_bit_operand(struct decode_cache *c)
{
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, group;
-
-
- /* we cannot decode insn before we complete previous rep insn */
- WARN_ON(ctxt->restart);
-
- c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
- ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
- case 0x66: /* operand-size override */
- /* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- c->rex_prefix = c->b;
- continue;
- case 0xf0: /* LOCK */
- c->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- c->rex_prefix = 0;
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if (c->rex_prefix)
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
-
- /* Opcode byte(s). */
- c->d = opcode_table[c->b];
- if (c->d == 0) {
- /* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- c->d = twobyte_table[c->b];
- }
- }
-
- if (c->d & Group) {
- group = c->d & GroupMask;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- group = (group << 3) + ((c->modrm >> 3) & 7);
- if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
- c->d = group2_table[group];
- else
- c->d = group_table[group];
- }
-
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
- }
-
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
-
- /* ModRM and SIB bytes. */
- if (c->d & ModRM)
- rc = decode_modrm(ctxt, ops);
- else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
-
- if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, ops, c);
-
- if (c->ad_bytes != 8)
- c->modrm_ea = (u32)c->modrm_ea;
-
- if (c->rip_relative)
- c->modrm_ea += c->eip;
-
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (c->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(&c->src, c, 0);
- break;
- case SrcMem16:
- c->src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- c->src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- c->src.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->src.type = OP_REG;
- c->src.val = c->modrm_val;
- c->src.ptr = c->modrm_ptr;
- break;
- }
- c->src.type = OP_MEM;
- c->src.ptr = (unsigned long *)c->modrm_ea;
- c->src.val = 0;
- break;
- case SrcImm:
- case SrcImmU:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
- if ((c->d & SrcMask) == SrcImmU) {
- switch (c->src.bytes) {
- case 1:
- c->src.val &= 0xff;
- break;
- case 2:
- c->src.val &= 0xffff;
- break;
- case 4:
- c->src.val &= 0xffffffff;
- break;
- }
- }
- break;
- case SrcImmByte:
- case SrcImmUByte:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = 1;
- if ((c->d & SrcMask) == SrcImmByte)
- c->src.val = insn_fetch(s8, 1, c->eip);
- else
- c->src.val = insn_fetch(u8, 1, c->eip);
- break;
- case SrcAcc:
- c->src.type = OP_REG;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->src.bytes) {
- case 1:
- c->src.val = *(u8 *)c->src.ptr;
- break;
- case 2:
- c->src.val = *(u16 *)c->src.ptr;
- break;
- case 4:
- c->src.val = *(u32 *)c->src.ptr;
- break;
- case 8:
- c->src.val = *(u64 *)c->src.ptr;
- break;
- }
- break;
- case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
- break;
- case SrcSI:
- c->src.type = OP_MEM;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)
- register_address(c, seg_override_base(ctxt, ops, c),
- c->regs[VCPU_REGS_RSI]);
- c->src.val = 0;
- break;
- case SrcImmFAddr:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = c->op_bytes + 2;
- insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
- break;
- case SrcMemFAddr:
- c->src.type = OP_MEM;
- c->src.ptr = (unsigned long *)c->modrm_ea;
- c->src.bytes = c->op_bytes + 2;
- break;
- }
+ long sv = 0, mask;
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
- */
- switch (c->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 1;
- c->src2.val = insn_fetch(u8, 1, c->eip);
- break;
- case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
- break;
- }
+ if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
+ mask = ~(c->dst.bytes * 8 - 1);
- /* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- return 0;
- case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
- break;
- case DstMem:
- case DstMem64:
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.type = OP_REG;
- c->dst.val = c->dst.orig_val = c->modrm_val;
- c->dst.ptr = c->modrm_ptr;
- break;
- }
- c->dst.type = OP_MEM;
- c->dst.ptr = (unsigned long *)c->modrm_ea;
- if ((c->d & DstMask) == DstMem64)
- c->dst.bytes = 8;
- else
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
+ if (c->src.bytes == 2)
+ sv = (s16)c->src.val & (s16)mask;
+ else if (c->src.bytes == 4)
+ sv = (s32)c->src.val & (s32)mask;
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- break;
- case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->dst.bytes) {
- case 1:
- c->dst.val = *(u8 *)c->dst.ptr;
- break;
- case 2:
- c->dst.val = *(u16 *)c->dst.ptr;
- break;
- case 4:
- c->dst.val = *(u32 *)c->dst.ptr;
- break;
- case 8:
- c->dst.val = *(u64 *)c->dst.ptr;
- break;
- }
- c->dst.orig_val = c->dst.val;
- break;
- case DstDI:
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)
- register_address(c, es_base(ctxt, ops),
- c->regs[VCPU_REGS_RDI]);
- c->dst.val = 0;
- break;
+ c->dst.addr.mem += (sv >> 3);
}
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
}
static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
ctxt->vcpu);
if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt, addr, err);
+ emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
addr = dt.address + index * 8;
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt, addr, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
addr = dt.address + index * 8;
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt, addr, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -1573,6 +1068,25 @@ exception:
return X86EMUL_PROPAGATE_FAULT;
}
+static void write_register_operand(struct operand *op)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (op->bytes) {
+ case 1:
+ *(u8 *)op->addr.reg = (u8)op->val;
+ break;
+ case 2:
+ *(u16 *)op->addr.reg = (u16)op->val;
+ break;
+ case 4:
+ *op->addr.reg = (u32)op->val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *op->addr.reg = op->val;
+ break;
+ }
+}
+
static inline int writeback(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
switch (c->dst.type) {
case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
+ write_register_operand(&c->dst);
break;
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
+ c->dst.addr.mem,
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu);
else
rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
+ c->dst.addr.mem,
&c->dst.val,
c->dst.bytes,
&err,
ctxt->vcpu);
if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt,
- (unsigned long)c->dst.ptr, err);
+ emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
+ c->regs[VCPU_REGS_RSP]);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt);
+
return rc;
}
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
return rc;
}
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+ u32 err;
+
+ /* TODO: Add limit checks */
+ c->src.val = ctxt->eflags;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+
+ c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = c->eip;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ ops->get_idt(&dt, ctxt->vcpu);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = eip;
+
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_int_real(ctxt, ops, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+ EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+ EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+ unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = temp_eip;
+
+
+ if (c->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (c->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+ return rc;
+}
+
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops* ops)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt, ops);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
+ unsigned long *rax = &c->regs[VCPU_REGS_RAX];
+ unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+ u8 de = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
case 3: /* neg */
emulate_1op("neg", c->dst, ctxt->eflags);
break;
+ case 4: /* mul */
+ emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 5: /* imul */
+ emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 6: /* div */
+ emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
+ case 7: /* idiv */
+ emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
default:
- return 0;
+ return X86EMUL_UNHANDLEABLE;
}
- return 1;
+ if (de)
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
return rc;
}
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, ops, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.val = c->src.val;
+ return rc;
+}
+
static inline void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
+ if (ctxt->perm_ok)
+ return true;
+
if (emulator_bad_iopl(ctxt, ops))
if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
return false;
+
+ ctxt->perm_ok = true;
+
return true;
}
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
}
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
}
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
}
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
int rc;
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
+ op->addr.mem = register_address(c, base, c->regs[reg]);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_push(ctxt, ctxt->ops);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = c->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ c->dst.val = al;
+ /* Set PF, ZF, SF */
+ c->src.type = OP_IMM;
+ c->src.val = 0;
+ c->src.bytes = 1;
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+
+ old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ old_eip = c->eip;
+
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+ if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+ return X86EMUL_CONTINUE;
+
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
+
+ c->src.val = old_cs;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = old_eip;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &c->eip;
+ c->dst.bytes = c->op_bytes;
+ rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.val = c->src2.val;
+ return em_imul(ctxt);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->src.bytes;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+ c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
<