aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig33
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/Makefile6
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/eboot.c146
-rw-r--r--arch/x86/boot/compressed/eboot.h52
-rw-r--r--arch/x86/boot/compressed/head_32.S14
-rw-r--r--arch/x86/boot/compressed/head_64.S9
-rw-r--r--arch/x86/boot/header.S26
-rw-r--r--arch/x86/boot/tools/build.c37
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S29
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c14
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/ia32/ia32entry.S18
-rw-r--r--arch/x86/include/asm/bootparam_utils.h4
-rw-r--r--arch/x86/include/asm/checksum_32.h22
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/dma-contiguous.h1
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/espfix.h16
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/fpu-internal.h13
-rw-r--r--arch/x86/include/asm/hugetlb.h1
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/jump_label.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h34
-rw-r--r--arch/x86/include/asm/mce.h13
-rw-r--r--arch/x86/include/asm/mmu_context.h20
-rw-r--r--arch/x86/include/asm/pgtable.h11
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h2
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/xen/page.h31
-rw-r--r--arch/x86/include/asm/xor_avx.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h6
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h1
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/sleep.c18
-rw-r--r--arch/x86/kernel/amd_nb.c13
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/common.c9
-rw-r--r--arch/x86/kernel/cpu/intel.c20
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c21
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c16
-rw-r--r--arch/x86/kernel/cpu/perf_event.c15
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c53
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c4
-rw-r--r--arch/x86/kernel/devicetree.c3
-rw-r--r--arch/x86/kernel/e820.c5
-rw-r--r--arch/x86/kernel/early-quirks.c15
-rw-r--r--arch/x86/kernel/entry_32.S34
-rw-r--r--arch/x86/kernel/entry_64.S81
-rw-r--r--arch/x86/kernel/espfix_64.c208
-rw-r--r--arch/x86/kernel/ftrace.c95
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/head_64.S8
-rw-r--r--arch/x86/kernel/i387.c17
-rw-r--r--arch/x86/kernel/ldt.c5
-rw-r--r--arch/x86/kernel/microcode_amd.c2
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c4
-rw-r--r--arch/x86/kernel/process.c6
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c11
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c16
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/setup.c25
-rw-r--r--arch/x86/kernel/signal.c11
-rw-r--r--arch/x86/kernel/smpboot.c10
-rw-r--r--arch/x86/kernel/sys_x86_64.c2
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kernel/xsave.c7
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/Makefile13
-rw-r--r--arch/x86/kvm/cpuid.c57
-rw-r--r--arch/x86/kvm/cpuid.h5
-rw-r--r--arch/x86/kvm/emulate.c108
-rw-r--r--arch/x86/kvm/i8254.c20
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/lapic.c154
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c20
-rw-r--r--arch/x86/kvm/paging_tmpl.h28
-rw-r--r--arch/x86/kvm/svm.c18
-rw-r--r--arch/x86/kvm/vmx.c34
-rw-r--r--arch/x86/kvm/x86.c158
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/lib/csum-wrappers_64.c12
-rw-r--r--arch/x86/mm/dump_pagetables.c39
-rw-r--r--arch/x86/mm/fault.c57
-rw-r--r--arch/x86/mm/hugetlbpage.c187
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/ioremap.c26
-rw-r--r--arch/x86/mm/mmap.c6
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/net/bpf_jit.S2
-rw-r--r--arch/x86/net/bpf_jit_comp.c14
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/platform/efi/efi.c169
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/syscalls/syscall_64.tbl6
-rw-r--r--arch/x86/xen/p2m.c10
-rw-r--r--arch/x86/xen/setup.c22
-rw-r--r--arch/x86/xen/smp.c20
-rw-r--r--arch/x86/xen/time.c17
117 files changed, 1679 insertions, 989 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe120da25625..3115eae96ad8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,6 +121,7 @@ config X86
select OLD_SIGACTION if X86_32
select COMPAT_OLD_SIGACTION if IA32_EMULATION
select RTC_LIB
+ select ARCH_SUPPORTS_ATOMIC_RMW
config INSTRUCTION_DECODER
def_bool y
@@ -207,6 +208,12 @@ config ARCH_HIBERNATION_POSSIBLE
config ARCH_SUSPEND_POSSIBLE
def_bool y
+config ARCH_WANT_HUGE_PMD_SHARE
+ def_bool y
+
+config ARCH_WANT_GENERAL_HUGETLB
+ def_bool y
+
config ZONE_DMA32
bool
default X86_64
@@ -951,10 +958,27 @@ config VM86
default y
depends on X86_32
---help---
- This option is required by programs like DOSEMU to run 16-bit legacy
- code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ This option is required by programs like DOSEMU to run
+ 16-bit real mode legacy code on x86 processors. It also may
+ be needed by software like XFree86 to initialize some video
+ cards via BIOS. Disabling this option saves about 6K.
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
+ default y
+ ---help---
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+ plus 16K runtime memory on x86-64,
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+ def_bool y
+ depends on X86_16BIT && X86_64
config TOSHIBA
tristate "Toshiba Laptop support"
@@ -1560,6 +1584,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
depends on EFI
+ select RELOCATABLE
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5c477260294f..412189d2bff9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -31,6 +31,9 @@ ifeq ($(CONFIG_X86_32),y)
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
+ # Don't autogenerate MMX or SSE instructions
+ KBUILD_CFLAGS += -mno-mmx -mno-sse
+
# Never want PIC in a 32-bit kernel, prevent breakage with GCC built
# with nonstandard options
KBUILD_CFLAGS += -fno-pic
@@ -57,8 +60,11 @@ else
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
+ # Don't autogenerate MMX or SSE instructions
+ KBUILD_CFLAGS += -mno-mmx -mno-sse
+
# Use -mpreferred-stack-boundary=3 if supported.
- KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
+ KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 379814bc41e3..6cf0111783d3 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -53,18 +53,18 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
# How to compile the 16-bit code. Note we always compile for -march=i386,
# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS := $(USERINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
+KBUILD_CFLAGS := $(USERINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \
-DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes \
-march=i386 -mregparm=3 \
-include $(srctree)/$(src)/code16gcc.h \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+ -mno-mmx -mno-sse \
$(call cc-option, -ffreestanding) \
$(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
+ $(call cc-option, -fno-unit-at-a-time)) \
$(call cc-option, -fno-stack-protector) \
$(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5ef205c5f37b..7194d9f094bc 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -12,6 +12,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small
KBUILD_CFLAGS += $(cflags-y)
+KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c205035a6b96..17177e80d466 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -865,6 +865,9 @@ fail:
* Because the x86 boot code expects to be passed a boot_params we
* need to create one ourselves (usually the bootloader would create
* one for us).
+ *
+ * The caller is responsible for filling out ->code32_start in the
+ * returned boot_params.
*/
struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
{
@@ -875,14 +878,15 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
struct efi_info *efi;
efi_loaded_image_t *image;
void *options;
- u32 load_options_size;
efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
int options_size = 0;
efi_status_t status;
- unsigned long cmdline;
+ char *cmdline_ptr;
u16 *s2;
u8 *s1;
int i;
+ unsigned long ramdisk_addr;
+ unsigned long ramdisk_size;
sys_table = _table;
@@ -893,13 +897,14 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
status = efi_call_phys3(sys_table->boottime->handle_protocol,
handle, &proto, (void *)&image);
if (status != EFI_SUCCESS) {
- efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
+ efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
return NULL;
}
- status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+ status = efi_low_alloc(sys_table, 0x4000, 1,
+ (unsigned long *)&boot_params);
if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc lowmem for boot params\n");
+ efi_printk(sys_table, "Failed to alloc lowmem for boot params\n");
return NULL;
}
@@ -921,45 +926,13 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
hdr->vid_mode = 0xffff;
hdr->boot_flag = 0xAA55;
- hdr->code32_start = (__u64)(unsigned long)image->image_base;
-
hdr->type_of_loader = 0x21;
/* Convert unicode cmdline to ascii */
- options = image->load_options;
- load_options_size = image->load_options_size / 2; /* ASCII */
- cmdline = 0;
- s2 = (u16 *)options;
-
- if (s2) {
- while (*s2 && *s2 != '\n' && options_size < load_options_size) {
- s2++;
- options_size++;
- }
-
- if (options_size) {
- if (options_size > hdr->cmdline_size)
- options_size = hdr->cmdline_size;
-
- options_size++; /* NUL termination */
-
- status = low_alloc(options_size, 1, &cmdline);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for cmdline\n");
- goto fail;
- }
-
- s1 = (u8 *)(unsigned long)cmdline;
- s2 = (u16 *)options;
-
- for (i = 0; i < options_size - 1; i++)
- *s1++ = *s2++;
-
- *s1 = '\0';
- }
- }
-
- hdr->cmd_line_ptr = cmdline;
+ cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size);
+ if (!cmdline_ptr)
+ goto fail;
+ hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
hdr->ramdisk_image = 0;
hdr->ramdisk_size = 0;
@@ -969,16 +942,20 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
memset(sdt, 0, sizeof(*sdt));
- status = handle_ramdisks(image, hdr);
+ status = handle_cmdline_files(sys_table, image,
+ (char *)(unsigned long)hdr->cmd_line_ptr,
+ "initrd=", hdr->initrd_addr_max,
+ &ramdisk_addr, &ramdisk_size);
if (status != EFI_SUCCESS)
goto fail2;
+ hdr->ramdisk_image = ramdisk_addr;
+ hdr->ramdisk_size = ramdisk_size;
return boot_params;
fail2:
- if (options_size)
- low_free(options_size, hdr->cmd_line_ptr);
+ efi_free(sys_table, options_size, hdr->cmd_line_ptr);
fail:
- low_free(0x4000, (unsigned long)boot_params);
+ efi_free(sys_table, 0x4000, (unsigned long)boot_params);
return NULL;
}
@@ -992,22 +969,24 @@ static efi_status_t exit_boot(struct boot_params *boot_params,
efi_memory_desc_t *mem_map;
efi_status_t status;
__u32 desc_version;
+ bool called_exit = false;
u8 nr_entries;
int i;
size = sizeof(*mem_map) * 32;
again:
- size += sizeof(*mem_map);
+ size += sizeof(*mem_map) * 2;
_size = size;
- status = low_alloc(size, 1, (unsigned long *)&mem_map);
+ status = efi_low_alloc(sys_table, size, 1, (unsigned long *)&mem_map);
if (status != EFI_SUCCESS)
return status;
+get_map:
status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
mem_map, &key, &desc_size, &desc_version);
if (status == EFI_BUFFER_TOO_SMALL) {
- low_free(_size, (unsigned long)mem_map);
+ efi_free(sys_table, _size, (unsigned long)mem_map);
goto again;
}
@@ -1029,8 +1008,20 @@ again:
/* Might as well exit boot services now */
status = efi_call_phys2(sys_table->boottime->exit_boot_services,
handle, key);
- if (status != EFI_SUCCESS)
- goto free_mem_map;
+ if (status != EFI_SUCCESS) {
+ /*
+ * ExitBootServices() will fail if any of the event
+ * handlers change the memory map. In which case, we
+ * must be prepared to retry, but only once so that
+ * we're guaranteed to exit on repeated failures instead
+ * of spinning forever.
+ */
+ if (called_exit)
+ goto free_mem_map;
+
+ called_exit = true;
+ goto get_map;
+ }
/* Historic? */
boot_params->alt_mem_k = 32 * 1024;
@@ -1097,44 +1088,10 @@ again:
return EFI_SUCCESS;
free_mem_map:
- low_free(_size, (unsigned long)mem_map);
+ efi_free(sys_table, _size, (unsigned long)mem_map);
return status;
}
-static efi_status_t relocate_kernel(struct setup_header *hdr)
-{
- unsigned long start, nr_pages;
- efi_status_t status;
-
- /*
- * The EFI firmware loader could have placed the kernel image
- * anywhere in memory, but the kernel has various restrictions
- * on the max physical address it can run at. Attempt to move
- * the kernel to boot_params.pref_address, or as low as
- * possible.
- */
- start = hdr->pref_address;
- nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-
- status = efi_call_phys4(sys_table->boottime->allocate_pages,
- EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
- nr_pages, &start);
- if (status != EFI_SUCCESS) {
- status = low_alloc(hdr->init_size, hdr->kernel_alignment,
- &start);
- if (status != EFI_SUCCESS)
- efi_printk("Failed to alloc mem for kernel\n");
- }
-
- if (status == EFI_SUCCESS)
- memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
- hdr->init_size);
-
- hdr->pref_address = hdr->code32_start;
- hdr->code32_start = (__u32)start;
-
- return status;
-}
/*
* On success we return a pointer to a boot_params structure, and NULL
@@ -1163,14 +1120,15 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
EFI_LOADER_DATA, sizeof(*gdt),
(void **)&gdt);
if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for gdt structure\n");
+ efi_printk(sys_table, "Failed to alloc mem for gdt structure\n");
goto fail;
}
gdt->size = 0x800;
- status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
+ status = efi_low_alloc(sys_table, gdt->size, 8,
+ (unsigned long *)&gdt->address);
if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for gdt\n");
+ efi_printk(sys_table, "Failed to alloc mem for gdt\n");
goto fail;
}
@@ -1178,7 +1136,7 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
EFI_LOADER_DATA, sizeof(*idt),
(void **)&idt);
if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for idt structure\n");
+ efi_printk(sys_table, "Failed to alloc mem for idt structure\n");
goto fail;
}
@@ -1190,10 +1148,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
* address, relocate it.
*/
if (hdr->pref_address != hdr->code32_start) {
- status = relocate_kernel(hdr);
-
+ unsigned long bzimage_addr = hdr->code32_start;
+ status = efi_relocate_kernel(sys_table, &bzimage_addr,
+ hdr->init_size, hdr->init_size,
+ hdr->pref_address,
+ hdr->kernel_alignment);
if (status != EFI_SUCCESS)
goto fail;
+
+ hdr->pref_address = hdr->code32_start;
+ hdr->code32_start = bzimage_addr;
}
status = exit_boot(boot_params, handle);
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index e5b0a8f91c5f..cc78e2da7af9 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -10,8 +10,6 @@
#define SEG_GRANULARITY_4KB (1 << 0)
#define DESC_TYPE_CODE_DATA (1 << 0)
-
-#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT)
#define EFI_READ_CHUNK_SIZE (1024 * 1024)
#define EFI_CONSOLE_OUT_DEVICE_GUID \
@@ -40,6 +38,24 @@ struct efi_graphics_output_mode_info {
u32 pixels_per_scan_line;
} __packed;
+struct efi_graphics_output_protocol_mode_32 {
+ u32 max_mode;
+ u32 mode;
+ u32 info;
+ u32 size_of_info;
+ u64 frame_buffer_base;
+ u32 frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol_mode_64 {
+ u32 max_mode;
+ u32 mode;
+ u64 info;
+ u64 size_of_info;
+ u64 frame_buffer_base;
+ u64 frame_buffer_size;
+} __packed;
+
struct efi_graphics_output_protocol_mode {
u32 max_mode;
u32 mode;
@@ -49,6 +65,20 @@ struct efi_graphics_output_protocol_mode {
unsigned long frame_buffer_size;
} __packed;
+struct efi_graphics_output_protocol_32 {
+ u32 query_mode;
+ u32 set_mode;
+ u32 blt;
+ u32 mode;
+};
+
+struct efi_graphics_output_protocol_64 {
+ u64 query_mode;
+ u64 set_mode;
+ u64 blt;
+ u64 mode;
+};
+
struct efi_graphics_output_protocol {
void *query_mode;
unsigned long set_mode;
@@ -56,16 +86,22 @@ struct efi_graphics_output_protocol {
struct efi_graphics_output_protocol_mode *mode;
};
+struct efi_uga_draw_protocol_32 {
+ u32 get_mode;
+ u32 set_mode;
+ u32 blt;
+};
+
+struct efi_uga_draw_protocol_64 {
+ u64 get_mode;
+ u64 set_mode;
+ u64 blt;
+};
+
struct efi_uga_draw_protocol {
void *get_mode;
void *set_mode;
void *blt;
};
-struct efi_simple_text_output_protocol {
- void *reset;
- void *output_string;
- void *test_string;
-};
-
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1e3184f6072f..abb988a54c69 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -50,6 +50,13 @@ ENTRY(efi_pe_entry)
pushl %eax
pushl %esi
pushl %ecx
+
+ call reloc
+reloc:
+ popl %ecx
+ subl reloc, %ecx
+ movl %ecx, BP_code32_start(%eax)
+
sub $0x4, %esp
ENTRY(efi_stub_entry)
@@ -63,12 +70,7 @@ ENTRY(efi_stub_entry)
hlt
jmp 1b
2:
- call 3f
-3:
- popl %eax
- subl $3b, %eax
- subl BP_pref_address(%esi), %eax
- add BP_code32_start(%esi), %eax
+ movl BP_code32_start(%esi), %eax
leal preferred_addr(%eax), %eax
jmp *%eax
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 16f24e6dad79..92059b8f3f7b 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -217,6 +217,8 @@ ENTRY(efi_pe_entry)
cmpq $0,%rax
je 1f
mov %rax, %rdx
+ leaq startup_32(%rip), %rax
+ movl %eax, BP_code32_start(%rdx)
popq %rsi
popq %rdi
@@ -230,12 +232,7 @@ ENTRY(efi_stub_entry)
hlt
jmp 1b
2:
- call 3f
-3:
- popq %rax
- subq $3b, %rax
- subq BP_pref_address(%rsi), %rax
- add BP_code32_start(%esi), %eax
+ movl BP_code32_start(%esi), %eax
leaq preferred_addr(%rax), %rax
jmp *%rax
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 9ec06a1f6d61..425712462178 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0
#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 94c544650020..971a0ce062aa 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -141,7 +141,7 @@ static void usage(void)
#ifdef CONFIG_EFI_STUB
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -162,10 +162,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);
/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);
/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);
/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -177,6 +177,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -201,9 +206,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
pe_header = get_unaligned_le32(&buf[0x3c]);
- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -218,6 +220,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
#endif /* CONFIG_EFI_STUB */
@@ -268,6 +286,9 @@ int main(int argc, char ** argv)
int fd;
void *kernel;
u32 crc = 0xffffffffUL;
+#ifdef CONFIG_EFI_STUB
+ unsigned int init_sz;
+#endif
/* Defaults for old kernel */
#ifdef CONFIG_X86_32
@@ -338,7 +359,9 @@ int main(int argc, char ** argv)
put_unaligned_le32(sys_size, &buf[0x1f4]);
#ifdef CONFIG_EFI_STUB
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */
efi_stub_entry -= 0x200;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 586f41aac361..185fad49d86f 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -24,10 +24,6 @@
.align 16
.Lbswap_mask:
.octa 0x000102030405060708090a0b0c0d0e0f
-.Lpoly:
- .octa 0xc2000000000000000000000000000001
-.Ltwo_one:
- .octa 0x00000001000000000000000000000001
#define DATA %xmm0
#define SHASH %xmm1
@@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update)
.Lupdate_just_ret:
ret
ENDPROC(clmul_ghash_update)
-
-/*
- * void clmul_ghash_setkey(be128 *shash, const u8 *key);
- *
- * Calculate hash_key << 1 mod poly
- */
-ENTRY(clmul_ghash_setkey)
- movaps .Lbswap_mask, BSWAP
- movups (%rsi), %xmm0
- PSHUFB_XMM BSWAP %xmm0
- movaps %xmm0, %xmm1
- psllq $1, %xmm0
- psrlq $63, %xmm1
- movaps %xmm1, %xmm2
- pslldq $8, %xmm1
- psrldq $8, %xmm2
- por %xmm1, %xmm0
- # reduction
- pshufd $0b00100100, %xmm2, %xmm1
- pcmpeqd .Ltwo_one, %xmm1
- pand .Lpoly, %xmm1
- pxor %xmm1, %xmm0
- movups %xmm0, (%rdi)
- ret
-ENDPROC(clmul_ghash_setkey)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 6759dd1135be..d785cf2c529c 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -30,8 +30,6 @@ void clmul_ghash_mul(char *dst, const be128 *shash);
void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
const be128 *shash);
-void clmul_ghash_setkey(be128 *shash, const u8 *key);
-
struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};
@@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *key, unsigned int keylen)
{
struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ be128 *x = (be128 *)key;
+ u64 a, b;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- clmul_ghash_setkey(&ctx->shash, key);
+ /* perform multiplication by 'x' in GF(2^128) */
+ a = be64_to_cpu(x->a);
+ b = be64_to_cpu(x->b);
+
+ ctx->shash.a = (__be64)((b << 1) | (a >> 63));
+ ctx->shash.b = (__be64)((a << 1) | (b >> 63));
+
+ if (a >> 63)
+ ctx->shash.b ^= cpu_to_be64(0xc2);
return 0;
}
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6cbd8df348d2..9f5e71f06671 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cf1a471a18a2..10adb41f162e 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -459,7 +459,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 474dc1b59f72..c9305ef1d411 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,16 @@ ENTRY(ia32_sysenter_target)
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
+
+ /*
+ * Sysenter doesn't filter flags, so we need to clear NT
+ * ourselves. To save a few cycles, we can check whether
+ * NT was set instead of doing an unconditional popfq.
+ */
+ testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+ jnz sysenter_fix_flags
+sysenter_flags_fixed:
+
orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
CFI_REMEMBER_STATE
@@ -184,6 +194,8 @@ sysexit_from_sys_call:
TRACE_IRQS_ON
ENABLE_INTERRUPTS_SYSEXIT32
+ CFI_RESTORE_STATE
+
#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
movl %esi,%r9d /* 6th arg: 4th syscall arg */
@@ -226,7 +238,6 @@ sysexit_from_sys_call:
.endm
sysenter_auditsys:
- CFI_RESTORE_STATE
auditsys_entry_common
movl %ebp,%r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
@@ -235,6 +246,11 @@ sysexit_audit:
auditsys_exit sysexit_from_sys_call
#endif
+sysenter_fix_flags:
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+ popfq_cfi
+ jmp sysenter_flags_fixed
+
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 653668d140f9..4a8cb8d7cbd5 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params)
*/
if (boot_params->sentinel) {
/* fields in boot_params are left uninitialized, clear them */
- memset(&boot_params->olpc_ofw_header, 0,
+ memset(&boot_params->ext_ramdisk_image, 0,
(char *)&boot_params->efi_info -
- (char *)&boot_params->olpc_ofw_header);
+ (char *)&boot_params->ext_ramdisk_image);
memset(&boot_params->kbd_status, 0,
(char *)&boot_params->hdr -
(char *)&boot_params->kbd_status);
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 46fc474fd819..f50de6951738 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src,
int len, __wsum sum,
int *err_ptr)
{
+ __wsum ret;
+
might_sleep();
- return csum_partial_copy_generic((__force void *)src, dst,
- len, sum, err_ptr, NULL);
+ stac();
+ ret = csum_partial_copy_generic((__force void *)src, dst,
+ len, sum, err_ptr, NULL);
+ clac();
+
+ return ret;
}
/*
@@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src,
int len, __wsum sum,
int *err_ptr)
{
+ __wsum ret;
+
might_sleep();
- if (access_ok(VERIFY_WRITE, dst, len))
- return csum_partial_copy_generic(src, (__force void *)dst,
- len, sum, NULL, err_ptr);
+ if (access_ok(VERIFY_WRITE, dst, len)) {
+ stac();
+ ret = csum_partial_copy_generic(src, (__force void *)dst,
+ len, sum, NULL, err_ptr);
+ clac();
+ return ret;
+ }
if (len)
*err_ptr = -EFAULT;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e99ac27f95b2..4af181dacf9e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -365,7 +365,7 @@ extern const char * const x86_power_flags[32];
static __always_inline __pure bool __static_cpu_has(u16 bit)
{
#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
- asm goto("1: jmp %l[t_no]\n"
+ asm_volatile_goto("1: jmp %l[t_no]\n"
"2:\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n"
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
index c09241659971..b4b38bacb404 100644
--- a/arch/x86/include/asm/dma-contiguous.h
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -4,7 +4,6 @@
#ifdef __KERNEL__
#include <linux/types.h>
-#include <asm-generic/dma-contiguous.h>
static inline void
dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index cccd07fa5e3a..779c2efe2e97 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -29,7 +29,7 @@ extern void e820_setup_gap(void);
extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
unsigned long start_addr, unsigned long long end_addr);
struct setup_data;
-extern void parse_e820_ext(struct setup_data *data);
+extern void parse_e820_ext(u64 phys_addr, u32 data_len);
#if defined(CONFIG_X86_64) || \
(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 60c89f30c727..7a76dc284166 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -94,7 +94,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
-extern unsigned long x86_efi_facility;
+extern struct efi_scratch efi_scratch;
extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9c999c1674fa..01f15b227d7e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -155,8 +155,9 @@ do { \
#define elf_check_arch(x) \
((x)->e_machine == EM_X86_64)
-#define compat_elf_check_arch(x) \
- (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64)
+#define compat_elf_check_arch(x) \
+ (elf_check_arch_ia32(x) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
#if __USER32_DS != __USER_DS
# error "The following code assumes __USER32_DS == __USER_DS"
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..99efebb2f69d
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(void);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 0dc7d9e21c34..9d7d36c82fc2 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -123,14 +123,14 @@ enum fixed_addresses {
__end_of_permanent_fixed_addresses,
/*
- * 256 temporary boot-time mappings, used by early_ioremap(),
+ * 512 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * If necessary we round it up to the next 256 pages boundary so
+ * If necessary we round it up to the next 512 pages boundary so
* that we can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
+#define FIX_BTMAPS_SLOTS 8
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
FIX_BTMAP_END =
(__end_of_permanent_fixed_addresses ^
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e25cc33ec54d..e72b2e41499e 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -295,12 +295,13 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. "m" is a random variable that should be in L1 */
- alternative_input(
- ASM_NOP8 ASM_NOP2,
- "emms\n\t" /* clear stack tags */
- "fildl %P[addr]", /* set F?P to defined value */
- X86_FEATURE_FXSAVE_LEAK,
- [addr] "m" (tsk->thread.fpu.has_fpu));
+ if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %P[addr]" /* set F?P to defined value */
+ : : [addr] "m" (tsk->thread.fpu.has_fpu));
+ }
return fpu_restore_checking(&tsk->thread.fpu);
}
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index a8091216963b..68c05398bba9 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ ptep_clear_flush(vma, addr, ptep);
}
static inline int huge_pte_none(pte_t pte)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e624..0a8b519226b8 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 3a16c1483b45..029766958e69 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -13,7 +13,7 @@
static __always_inline bool arch_static_branch(struct static_key *key)
{
- asm goto("1:"
+ asm_volatile_goto("1:"
STATIC_KEY_INITIAL_NOP
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c653767c..b15a6b5fa341 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,6 +79,13 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
#define SELECTOR_TI_MASK (1 << 2)
#define SELECTOR_RPL_MASK 0x03
@@ -96,10 +103,6 @@
#define ASYNC_PF_PER_VCPU 64
-struct kvm_vcpu;
-struct kvm;
-struct kvm_async_pf;
-
enum kvm_reg {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
@@ -445,7 +448,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
int switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
@@ -463,6 +466,7 @@ struct kvm_vcpu_arch {
u64 mmio_gva;
unsigned access;
gfn_t mmio_gfn;
+ u64 mmio_gen;
struct kvm_pmu pmu;
@@ -631,8 +635,8 @@ struct msr_data {
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
- int (*hardware_enable)(void *dummy);
- void (*hardware_disable)(void *dummy);
+ int (*hardware_enable)(void);
+ void (*hardware_disable)(void);
void (*check_processor_compatibility)(void *rtn);
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
@@ -952,6 +956,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
+static inline u64 get_canonical(u64 la)
+{
+ return ((int64_t)la << 16) >> 16;
+}
+
+static inline bool is_noncanonical_address(u64 la)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la) != la;
+#else
+ return false;
+#endif
+}
+
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -1010,7 +1028,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_define_shared_msr(unsigned index, u32 msr);
-void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fa5f71e021d5..e6833c655e59 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -32,11 +32,20 @@
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-#define MCACOD 0xffff /* MCA Error Code */
+
+/*
+ * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
+ * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+ * errors to indicate that errors are being filtered by hardware.
+ * We should mask out bit 12 when looking for specific signatures
+ * of uncorrected errors - so the F bit is deliberately skipped
+ * in this #define.
+ */
+#define MCACOD 0xefff /* MCA Error Code */
/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
-#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
#define MCACOD_DATA 0x0134 /* Data Load */
#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index cdbf36776106..be12c534fd59 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Re-load page tables */
load_cr3(next->pgd);
- /* stop flush ipis for the previous mm */
+ /* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
- /*
- * load the LDT, if the LDT is different:
- */
+ /* Load the LDT, if the LDT is different: */
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);
}
#ifdef CONFIG_SMP
- else {
+ else {
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
- /* We were in lazy tlb mode and leave_mm disabled
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * On established mms, the mm_cpumask is only changed
+ * from irq context, from ptep_clear_flush() while in
+ * lazy tlb mode, and here. Irqs are blocked during
+ * schedule, protecting us from simultaneous changes.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1e672234c4ff..5460bf923e16 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -415,9 +415,16 @@ static inline int pte_present(pte_t a)
}
#define pte_accessible pte_accessible
-static inline int pte_accessible(pte_t a)
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
- return pte_flags(a) & _PAGE_PRESENT;
+ if (pte_flags(a) & _PAGE_PRESENT)
+ return true;
+
+ if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+ mm_tlb_flush_pending(mm))
+ return true;
+
+ return false;
}
static inline int pte_hidden(pte_t pte)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 2d883440cb9a..b1609f2c524c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
#define EARLY_DYNAMIC_PAGE_TABLES 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 942a08623a1a..68e9f007cd4a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -232,6 +232,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index b7bf3505e1ec..2e327f114a1b 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -62,6 +62,8 @@ static inline void x86_ce4100_early_setup(void) { }
#ifndef _SETUP
+#include <asm/espfix.h>
+
/*
* This is set up by the setup-routine at boot-time
*/
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692eaabab5..e3ddd7db723f 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -233,8 +233,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
-/* The {read|write|spin}_lock() on x86 are full memory barriers. */
-static inline void smp_mb__after_lock(void) { }
-#define ARCH_HAS_SMP_MB_AFTER_LOCK
-
#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 095b21507b6a..60bd2748a7c9 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -119,9 +119,10 @@ static inline void setup_node_to_cpumask_map(void) { }
extern const struct cpumask *cpu_coregroup_mask(int cpu);
-#ifdef ENABLE_TOPO_DEFINES
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+
+#ifdef ENABLE_TOPO_DEFINES
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 6aef9fbc09b7..b913915e8e63 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -79,30 +79,38 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
}
-static inline unsigned long mfn_to_pfn(unsigned long mfn)
+static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
{
unsigned long pfn;
- int ret = 0;
+ int ret;
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
- if (unlikely(mfn >= machine_to_phys_nr)) {
- pfn = ~0;
- goto try_override;
- }
- pfn = 0;
+ if (unlikely(mfn >= machine_to_phys_nr))
+ return ~0;
+
/*
* The array access can fail (e.g., device space beyond end of RAM).
* In such cases it doesn't matter what we return (we return garbage),
* but we must handle the fault without crashing!
*/
ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
-try_override:
- /* ret might be < 0 if there are no entries in the m2p for mfn */
if (ret < 0)
- pfn = ~0;
- else if (get_phys_to_machine(pfn) != mfn)
+ return ~0;
+
+ return pfn;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) != mfn) {
/*
* If this appears to be a foreign mfn (because the pfn
* doesn't map back to the mfn), then check the local override
@@ -111,6 +119,7 @@ try_override:
* m2p_find_override_pfn returns ~0 if it doesn't find anything.
*/
pfn = m2p_find_override_pfn(mfn, ~0);
+ }
/*
* pfn is ~0 if there are no entries in the m2p for mfn or if the
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7ea79c5fa1f2..492b29802f57 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = {
#define AVX_XOR_SPEED \
do { \
- if (cpu_has_avx) \
+ if (cpu_has_avx && cpu_has_osxsave) \
xor_speed(&xor_block_avx); \
} while (0)
#define AVX_SELECT(FASTEST) \
- (cpu_has_avx ? &xor_block_avx : FASTEST)
+ (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
#else
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5d9a3033b3d7..d3a87780c70b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 {
__u32 padding[3];
};
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2)
/* for KVM_SET_CPUID2 */
struct kvm_cpuid2 {
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 2af848dfa754..d3fd447ecbee 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -179,6 +179,7 @@
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS 0xc0010141
+#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 54991a746043..b16e6d28f149 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -6,7 +6,7 @@
* EFLAGS bits
*/
#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
-#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
+#define X86_EFLAGS_FIXED 0x00000002 /* Bit 1 - always on */
#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7bd3bd310106..111eb356dbea 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-y += syscall_$(BITS).o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index b44577bc9744..ec94e11807dc 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -48,9 +48,20 @@ int acpi_suspend_lowlevel(void)
#ifndef CONFIG_64BIT
native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
if (!rdmsr_safe(MSR_EFER,
&header->pmode_efer_low,
- &header->pmode_efer_high))
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
@@ -61,7 +72,10 @@ int acpi_suspend_lowlevel(void)
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
&header->pmode_misc_en_low,
- &header->pmode_misc_en_high))
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
header->pmode_behavior |=
(1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 3048ded1b598..59554dca96ec 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{}
};
@@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{}
};
@@ -81,13 +83,20 @@ int amd_cache_northbridges(void)
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, amd_nb_link_ids);
- }
+ }
+ /* GART present only on Fam15h upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
- boot_cpu_data.x86 == 0x15)
+ (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
amd_northbridges.flags |= AMD_NB_GART;
/*
+ * Check for L3 cache presence.
+ */
+ if (!cpuid_edx(0x80000006))
+ return 0;
+
+ /*
* Some CPU families support L3 Cache Index Disable. There are some
* limitations because of E382 and E388 on family 0x10.
*/
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 904611bf0e5a..033eb44dc661 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1263,7 +1263,7 @@ void __cpuinit setup_local_APIC(void)
unsigned int value, queued;
int i, j, acked = 0;
unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz;
+ long long max_loops = cpu_khz ? cpu_khz : 1000000;
if (cpu_has_tsc)
rdtscll(tsc);
@@ -1360,7 +1360,7 @@ void __cpuinit setup_local_APIC(void)
break;
}
if (queued) {
- if (cpu_has_tsc) {
+ if (cpu_has_tsc && cpu_khz) {
rdtscll(ntsc);
max_loops = (cpu_khz << 10) - (ntsc - tsc);
} else
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 794f6eb54cd3..b32dbb411a9a 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -98,7 +98,7 @@ static int __init early_get_pnodeid(void)
break;
case UV3_HUB_PART_NUMBER:
case UV3_HUB_PART_NUMBER_X:
- uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1;
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
break;
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5013a48d1aff..ae177a014180 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -508,6 +508,16 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf) {
+ u64 val;
+
+ rdmsrl(MSR_AMD64_LS_CFG, val);
+ if (!(val & BIT(15)))
+ wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15));
+ }
+
}
static const int amd_erratum_383[];
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22018f70a671..6a7e3e9cffc3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -284,8 +284,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
raw_local_save_flags(eflags);
BUG_ON(eflags & X86_EFLAGS_AC);
- if (cpu_has(c, X86_FEATURE_SMAP))
+ if (cpu_has(c, X86_FEATURE_SMAP)) {
+#ifdef CONFIG_X86_SMAP
set_in_cr4(X86_CR4_SMAP);
+#else
+ clear_in_cr4(X86_CR4_SMAP);
+#endif
+ }
}
/*
@@ -1129,7 +1134,7 @@ void syscall_init(void)
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC);
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
/*
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9b0c441c03f5..8533e69d2b89 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -154,6 +154,21 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
+ * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
+ * to be modified
+ */
+ if (c->x86 == 5 && c->x86_model == 9) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
}
#ifdef CONFIG_X86_32
@@ -387,7 +402,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+ if (c->x86 == 6 && cpu_has_clflush &&
+ (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
@@ -627,7 +643,7 @@ static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
tlb_flushall_shift = 5;
break;
case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 1;
+ tlb_flushall_shift = 2;
break;
default:
tlb_flushall_shift = 6;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fa72a39e5d46..3982357de5b0 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -510,8 +510,9 @@ generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
- unsigned int mask_lo, mask_hi, base_lo, base_hi;
- unsigned int tmp, hi;
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
@@ -532,18 +533,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask_lo = size_or_mask | tmp;
+ tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
+ mask = size_or_mask | tmp;
/* Expand tmp with high bits to all 1s: */
- hi = fls(tmp);
+ hi = fls64(tmp);
if (hi > 0) {
- tmp |= ~((1<<(hi - 1)) - 1);
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
- if (tmp != mask_lo) {
+ if (tmp != mask) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
- mask_lo = tmp;
+ mask = tmp;
}
}
@@ -551,8 +552,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask_lo;
- *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *size = -mask;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
out_put_cpu:
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 726bf963c227..ca22b73aaa25 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -305,7 +305,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return -EINVAL;
}
- if (base & size_or_mask || size & size_or_mask) {
+ if ((base | (base + size - 1)) >>
+ (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
pr_warning("mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -583,6 +584,7 @@ static struct syscore_ops mtrr_syscore_ops = {
int __initdata changed_by_mtrr_cleanup;
+#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
*
@@ -600,7 +602,7 @@ void __init mtrr_bp_init(void)
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
phys_addr = 36;
@@ -619,7 +621,7 @@ void __init mtrr_bp_init(void)
boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
- size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
+ size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
size_and_mask = ~size_or_mask & 0xfffff00000ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
boot_cpu_data.x86 == 6) {
@@ -627,7 +629,7 @@ void __init mtrr_bp_init(void)
* VIA C* family have Intel style MTRRs,
* but don't support PAE
*/
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
phys_addr = 32;
}
@@ -637,21 +639,21 @@ void __init mtrr_bp_init(void)
if (cpu_has_k6_mtrr) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_has_centaur_mcr) {
mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_has_cyrix_arr) {
mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1025f3c99d20..123d9e2271dc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1165,6 +1165,9 @@ static void x86_pmu_del(struct perf_event *event, int flags)
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i]) {
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, event);
@@ -1249,10 +1252,20 @@ void perf_events_lapic_init(void)
static int __kprobes
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ int ret;
+ u64 start_clock;
+ u64 finish_clock;
+
if (!atomic_read(&active_events))
return NMI_DONE;
- return x86_pmu.handle_irq(regs);
+ start_clock = local_clock();
+ ret = x86_pmu.handle_irq(regs);
+ finish_clock = local_clock();
+
+ perf_sample_event_took(finish_clock - start_clock);
+
+ return ret;
}
struct event_constraint emptyconstraint;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 5f0581e713c2..b46601ada813 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
#include <asm/apic.h>
@@ -816,6 +817,18 @@ out:
return ret;
}
+static void ibs_eilvt_setup(void)
+{
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
static inline int get_ibs_lvt_offset(void)
{
u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+ clear_APIC_ibs(NULL);
+ return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+ ibs_eilvt_setup();
+ setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+ .resume = perf_ibs_resume,
+ .suspend = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+ register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
static int __cpuinit
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
- /*
- * Force LVT offset assignment for family 10h: The offsets are
- * not assigned by the BIOS for this family, so the OS is
- * responsible for doing it. If the OS assignment fails, fall
- * back to BIOS settings and try to setup this.
- */
- if (boot_cpu_data.x86 == 0x10)
- force_ibs_eilvt_setup();
+ ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
+ perf_ibs_pm_init();
get_online_cpus();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a9e22073bd56..6d6bb6f4fd43 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1199,6 +1199,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2163,6 +2172,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 52441a2af538..8aac56bda7dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -314,8 +314,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = {
static struct uncore_event_desc snbep_uncore_qpi_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
- INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
{ /* end: all zeroes */ },
};
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index b1581527a236..2fbad6b9f23c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
}
#ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
- unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
{
initrd_start = (unsigned long)__va(start);
initrd_end = (unsigned long)__va(end);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d32abeabbda5..174da5fc5a7b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -658,15 +658,18 @@ __init void e820_setup_gap(void)
* boot_params.e820_map, others are passed via SETUP_E820_EXT node of
* linked list of struct setup_data, which is parsed here.
*/
-void __init parse_e820_ext(struct setup_data *sdata)
+void __init parse_e820_ext(u64 phys_addr, u32 data_len)
{
int entries;
struct e820entry *extmap;
+ struct setup_data *sdata;
+ sdata = early_memremap(phys_addr, data_len);
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ early_iounmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 94ab6b90dd3f..4f7c82cdd0f5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -196,16 +196,21 @@ static void __init ati_bugs_contd(int num, int slot, int func)
static void __init intel_remapping_check(int num, int slot, int func)
{
u8 revision;
+ u16 device;
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
/*
- * Revision 0x13 of this chipset supports irq remapping
- * but has an erratum that breaks its behavior, flag it as such
+ * Revision <= 13 of all triggering devices id in this quirk
+ * have a problem draining interrupts when irq remapping is
+ * enabled, and should be flagged as broken. Additionally
+ * revision 0x22 of device id 0x3405 has this problem.
*/
- if (revision == 0x13)
+ if (revision <= 0x13)
+ set_irq_remapping_broken();
+ else if (device == 0x3405 && revision == 0x22)
set_irq_remapping_broken();
-
}
#define QFLAG_APPLY_ONCE 0x1
@@ -239,6 +244,8 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
{ PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8f3e2dec1df3..5c38e2b298cd 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -434,8 +434,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -516,6 +517,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -530,6 +532,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -540,6 +543,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -552,13 +556,9 @@ ENTRY(iret_exc)
.previous
_ASM_EXTABLE(irq_return,iret_exc)
+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
@@ -600,6 +600,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)
@@ -690,8 +691,13 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
/*
@@ -707,6 +713,7 @@ END(syscall_badsys)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -716,8 +723,10 @@ END(syscall_badsys)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -728,6 +737,7 @@ END(syscall_badsys)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm
/*
@@ -1075,7 +1085,7 @@ ENTRY(ftrace_caller)
pushl $0 /* Pass NULL as regs pointer */
movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
- leal function_trace_op, %ecx
+ movl function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
@@ -1133,7 +1143,7 @@ ENTRY(ftrace_regs_caller)
movl 12*4(%esp), %eax /* Load ip (1st parameter) */
subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
pushl %esp /* Save pt_regs as 4th parameter */
GLOBAL(ftrace_regs_call)
@@ -1335,11 +1345,13 @@ END(debug)
ENTRY(nmi)
RING0_INT_FRAME
ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1379,6 +1391,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1400,6 +1413,7 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 727208941030..8c6b5c2284c7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -58,6 +58,7 @@
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -88,7 +89,7 @@ END(function_hook)
MCOUNT_SAVE_FRAME \skip
/* Load the ftrace_ops into the 3rd parameter */
- leaq function_trace_op, %rdx
+ movq function_trace_op(%rip), %rdx
/* Load ip into the first parameter */
movq RIP(%rsp), %rdi
@@ -365,7 +366,7 @@ ENDPROC(native_usergs_sysret64)
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -1056,12 +1057,45 @@ restore_args:
irq_return:
INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return, bad_iret)
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -1127,9 +1161,40 @@ ENTRY(retint_kernel)
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $native_irq_return_iret,%rax
+ jne do_double_fault /* This shouldn't happen... */
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
+
/*
* End of kprobes section
*/
@@ -1298,7 +1363,7 @@ zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
+paranoiderrorentry double_fault __do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
@@ -1585,7 +1650,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..94d857fb1033
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 42a392a9fd02..1ffc32dbe450 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -77,8 +77,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
-static int
-do_ftrace_mod_code(unsigned long ip, const void *new_code)
+static unsigned long text_ip_addr(unsigned long ip)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
@@ -91,7 +90,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa_symbol(ip));
- return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ return ip;
}
static const unsigned char *ftrace_nop_replace(void)
@@ -123,8 +122,10 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
+ ip = text_ip_addr(ip);
+
/* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
+ if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
return -EPERM;
sync_core();
@@ -221,33 +222,56 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
return -EINVAL;
}
-int ftrace_update_ftrace_func(ftrace_func_t func)
+static unsigned long ftrace_update_func;
+
+static int update_ftrace_func(unsigned long ip, void *new)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ unsigned char old[MCOUNT_INSN_SIZE];
int ret;
- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
+ memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
+
+ ftrace_update_func = ip;
+ /* Make sure the breakpoints see the ftrace_update_func update */
+ smp_wmb();
/* See comment above by declaration of modifying_ftrace_code */
atomic_inc(&modifying_ftrace_code);
ret = ftrace_modify_code(ip, old, new);
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+ unsigned long ip = (unsigned long)(&ftrace_call);
+ unsigned char *new;
+ int ret;
+
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+
/* Also update the regs callback function */
if (!ret) {
ip = (unsigned long)(&ftrace_regs_call);
- memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = ftrace_modify_code(ip, old, new);
+ ret = update_ftrace_func(ip, new);
}
- atomic_dec(&modifying_ftrace_code);
-
return ret;
}
+static int is_ftrace_caller(unsigned long ip)
+{
+ if (ip == ftrace_update_func)
+ return 1;
+
+ return 0;
+}
+
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -257,10 +281,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
+ unsigned long ip;
+
if (WARN_ON_ONCE(!regs))
return 0;
- if (!ftrace_location(regs->ip - 1))
+ ip = regs->ip - 1;
+ if (!ftrace_location(ip) && !is_ftrace_caller(ip))
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
@@ -632,8 +659,8 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
ret = -EPERM;
goto out;
}
- run_sync();
out:
+ run_sync();
return ret;
fail_update:
@@ -665,45 +692,41 @@ int __init ftrace_dyn_arch_init(void *data)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- unsigned char code[MCOUNT_INSN_SIZE];
+ static union ftrace_code_union calc;
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ /* Jmp not a call (ignore the .e8) */
+ calc.e8 = 0xe9;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
+ /*
+ * ftrace external locks synchronize the access to the static variable.
+ */
+ return calc.code;
+}
- *(int *)(&code[1]) = new_offset;
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ unsigned char *new;
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
- return 0;
+ return update_ftrace_func(ip, new);
}
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
-
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
}
int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_stub);
}
#endif /* !CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 73afd11799ca..df63cae573e0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -566,6 +566,10 @@ ENDPROC(early_idt_handlers)
/* This is global to keep gas from relaxing the jumps */
ENTRY(early_idt_handler)
cld
+
+ cmpl $2,(%esp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
+
cmpl $2,%ss:early_recursion_flag
je hlt_loop
incl %ss:early_recursion_flag
@@ -616,8 +620,9 @@ ex_entry:
pop %edx
pop %ecx
pop %eax
- addl $8,%esp /* drop vector number and error code */
decl %ss:early_recursion_flag
+is_nmi:
+ addl $8,%esp /* drop vector number and error code */
iret
ENDPROC(early_idt_handler)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 321d65ebaffe..f2a9a2aa98f3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -343,6 +343,9 @@ early_idt_handlers:
ENTRY(early_idt_handler)
cld
+ cmpl $2,(%rsp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
+
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
@@ -405,8 +408,9 @@ ENTRY(early_idt_handler)
popq %rdx
popq %rcx
popq %rax
- addq $16,%rsp # drop vector number and error code
decl early_recursion_flag(%rip)
+is_nmi:
+ addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
ENDPROC(early_idt_handler)
@@ -513,7 +517,7 @@ ENTRY(phys_base)
#include "../../x86/xen/xen-head.S"
.section .bss, "aw", @nobits
- .align L1_CACHE_BYTES
+ .align PAGE_SIZE
ENTRY(idt_table)
.skip IDT_ENTRIES * 16
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index cb339097b9ea..b03ff1842547 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
void __kernel_fpu_end(void)
{
- if (use_eager_fpu())
- math_state_restore();
- else
+ if (use_eager_fpu()) {
+ /*
+ * For eager fpu, most the time, tsk_used_math() is true.
+ * Restore the user math as we are done with the kernel usage.
+ * At few instances during thread exit, signal handling etc,
+ * tsk_used_math() is false. Those few places will take proper
+ * actions, so we don't need to restore the math here.
+ */
+ if (likely(tsk_used_math(current)))
+ math_state_restore();
+ } else {
stts();
+ }
}
EXPORT_SYMBOL(__kernel_fpu_end);
@@ -116,7 +125,7 @@ static void __cpuinit mxcsr_feature_mask_init(void)
if (cpu_has_fxsr) {
memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
+ asm volatile("fxsave %0" : "+m" (fx_scratch));
mask = fx_scratch.mxcsr_mask;
if (mask == 0)
mask = 0x0000ffbf;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc987398923..c37886d759cc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index efdec7cd8e01..b516dfb411ec 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -430,7 +430,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
if (request_firmware(&fw, (const char *)fw_name, device)) {
- pr_err("failed to load file %s\n", fw_name);
+ pr_debug("failed to load file %s\n", fw_name);
goto out;
}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 872079a67e4d..f7d0672481fd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -100,8 +100,10 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
flag |= __GFP_ZERO;
again:
page = NULL;
- if (!(flag & GFP_ATOMIC))
+ /* CMA can be used only in the context which permits sleeping */
+ if (flag & __GFP_WAIT)
page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ /* fallback */
if (!page)
page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
if (!page)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 81a5f5e8f142..59b90379cb6a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be
* called with interrupts disabled.
*/
- local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- local_irq_enable();
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
} else
default_idle();
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7305f7dfc7ab..0339f5c14bf9 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -147,7 +147,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->bp = arg;
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
p->fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 355ae06dbf94..f99a242730e9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -176,7 +176,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->bp = arg;
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
return 0;
}
*childregs = *current_pt_regs();
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 29a8120e6fe8..baa61e7370b7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1475,15 +1475,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
force_sig_info(SIGTRAP, &info, tsk);
}
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
-#endif
-
/*
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
@@ -1521,7 +1512,7 @@ long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
- if (IS_IA32)
+ if (is_ia32_task())
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_ax,
regs->bx, regs->cx,
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 04ee1e2e4c02..52dbf1e400dc 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- node = val & 7;
+ node = pcibus_to_node(dev->bus) | (val & 7);
/*
* Some hardware may return an invalid node ID,
* so check it first:
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 76fa1e9a2b39..90fd1195f276 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -447,6 +447,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
},
},
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
+ .callback = set_pci_reboot,
+ .ident = "Dell PowerEdge C6100",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ },
+ },
+ { /* Some C6100 machines were shipped with vendor being 'Dell'. */
+ .callback = set_pci_reboot,
+ .ident = "Dell PowerEdge C6100",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819bb6a8..80eab01c1a68 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 56f7fcfe7fa2..a3627ade4b15 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -426,25 +426,23 @@ static void __init reserve_initrd(void)
static void __init parse_setup_data(void)
{
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- u32 data_len, map_len;
+ u32 data_len, map_len, data_type;
map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
(u64)sizeof(struct setup_data));
data = early_memremap(pa_data, map_len);
data_len = data->len + sizeof(struct setup_data);
- if (data_len > map_len) {
- early_iounmap(data, map_len);
- data = early_memremap(pa_data, data_len);
- map_len = data_len;
- }
+ data_type = data->type;
+ pa_next = data->next;
+ early_iounmap(data, map_len);
- switch (data->type) {
+ switch (data_type) {
case SETUP_E820_EXT:
- parse_e820_ext(data);
+ parse_e820_ext(pa_data, data_len);
break;
case SETUP_DTB:
add_dtb(pa_data);
@@ -452,8 +450,7 @@ static void __init parse_setup_data(void)
default:
break;
}
- pa_data = data->next;
- early_iounmap(data, map_len);
+ pa_data = pa_next;
}
}
@@ -911,11 +908,11 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL32", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL64", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
- set_bit(EFI_64BIT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
}
if (efi_enabled(EFI_BOOT))
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 69562992e457..66deef41512f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -364,7 +364,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. */
restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -429,7 +429,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
@@ -496,7 +496,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
put_user_ex(0, &frame->uc.uc__pad0);
if (ksig->ka.sa.sa_flags & SA_RESTORER) {
@@ -677,6 +677,11 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ if (used_math())
+ drop_init_fpu(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bfd348e99369..87084ab90d19 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -265,6 +265,13 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
/*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -1277,6 +1284,9 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_sibling_mask(cpu))
cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(cpu_sibling_mask(cpu));
cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index dbded5aedb81..30277e27431a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = new_begin;
}
} else {
- *begin = TASK_UNMAPPED_BASE;
+ *begin = current->mm->mmap_legacy_base;
*end = TASK_SIZE;
}
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 098b3cfda72e..4e27ba53c40c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -968,14 +968,17 @@ void __init tsc_init(void)
x86_init.timers.tsc_pre_init();
- if (!cpu_has_tsc)
+ if (!cpu_has_tsc) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
+ }
tsc_khz = x86_platform.calibrate_tsc();
cpu_khz = tsc_khz;
if (!tsc_khz) {
mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9a907a67be8f..c52c07efe970 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -125,10 +125,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index ada87a329edc..1ee723298e90 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -268,8 +268,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
return -1;
- drop_init_fpu(tsk); /* trigger finit */
-
return 0;
}
@@ -400,8 +398,11 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
set_used_math();
}
- if (use_eager_fpu())
+ if (use_eager_fpu()) {
+ preempt_disable();
math_state_restore();
+ preempt_enable();
+ }
return err;
} else {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a47a3e54b964..bdccfb62aa0d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
@@ -38,6 +39,7 @@ config KVM
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select KVM_VFIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d84048..25d22b2d6509 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
CFLAGS_vmx.o := -I.
-kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o \
- irqchip.o)
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \
- assigned-dev.o iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
+KVM := ../../../virt/kvm
+
+kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..89d288237b9c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -187,8 +187,14 @@ static bool supported_xcr0_bit(unsigned bit)
#define F(x) bit(X86_FEATURE_##x)
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index, int *nent, int maxnent)
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+ u32 func, u32 index, int *nent, int maxnent)
+{
+ return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
{
int r;
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -480,6 +486,15 @@ out:
return r;
}
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+ u32 idx, int *nent, int maxnent, unsigned int type)
+{
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+ return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
#undef F
struct kvm_cpuid_param {
@@ -494,8 +509,34 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
}
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+ * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+ * have to give ourselves satisfied only with the emulated side. /me
+ * sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (entries[i].padding[0] ||
+ entries[i].padding[1] ||
+ entries[i].padding[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
{
struct kvm_cpuid_entry2 *cpuid_entries;
int limit, nent = 0, r = -E2BIG, i;
@@ -512,6 +553,10 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
goto out;
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
r = -ENOMEM;
cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
if (!cpuid_entries)
@@ -525,7 +570,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
continue;
r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
- &nent, cpuid->nent);
+ &nent, cpuid->nent, type);
if (r)
goto out_free;
@@ -536,7 +581,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
limit = cpuid_entries[nent - 1].eax;
for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
- &nent, cpuid->nent);
+ &nent, cpuid->nent, type);
if (r)
goto out_free;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b7fd07984888..f1e4895174b2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -6,8 +6,9 @@
void kvm_update_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
struct kvm_cpuid_entry __user *entries);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752d..4c01f022c6ac 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -663,11 +663,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
}
-static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- register_address_increment(ctxt, &ctxt->_eip, rel);
-}
-
static u32 desc_limit_scaled(struct desc_struct *desc)
{
u32 limit = get_desc_limit(desc);
@@ -741,6 +736,38 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, NM_VECTOR, 0, false);
}
+static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ int cs_l)
+{
+ switch (ctxt->op_bytes) {
+ case 2:
+ ctxt->_eip = (u16)dst;
+ break;
+ case 4:
+ ctxt->_eip = (u32)dst;
+ break;
+ case 8:
+ if ((cs_l && is_noncanonical_address(dst)) ||
+ (!cs_l && (dst & ~(u32)-1)))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = dst;
+ break;
+ default:
+ WARN(1, "unsupported eip assignment size\n");
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
{
u16 selector;
@@ -2161,13 +2188,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
case 2: /* call near abs */ {
long int old_eip;
old_eip = ctxt->_eip;
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ break;
ctxt->src.val = old_eip;
rc = em_push(ctxt);
break;
}
case 4: /* jmp abs */
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
break;
case 5: /* jmp far */
rc = em_jmp_far(ctxt);
@@ -2199,16 +2228,21 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- return em_pop(ctxt);
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
}
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long cs;
+ int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2218,6 +2252,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -2465,7 +2502,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
- u64 msr_data;
+ u64 msr_data, rcx, rdx;
int usermode;
u16 cs_sel = 0, ss_sel = 0;
@@ -2481,6 +2518,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
else
usermode = X86EMUL_MODE_PROT32;
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
cs.dpl = 3;
ss.dpl = 3;
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
@@ -2498,6 +2538,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
+ if (is_noncanonical_address(rcx) ||
+ is_noncanonical_address(rdx))
+ return emulate_gp(ctxt, 0);
break;
}
cs_sel |= SELECTOR_RPL_MASK;
@@ -2506,8 +2549,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
- *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->_eip = rdx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
}
@@ -3046,10 +3089,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt)
static int em_call(struct x86_emulate_ctxt *ctxt)
{
+ int rc;
long rel = ctxt->src.val;
ctxt->src.val = (unsigned long)ctxt->_eip;
- jmp_rel(ctxt, rel);
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
return em_push(ctxt);
}
@@ -3081,11 +3127,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ unsigned long eip;
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
if (rc != X86EMUL_CONTINUE)
return rc;
rsp_increment(ctxt, ctxt->src.val);
@@ -3375,20 +3422,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
static int em_loop(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE;
+
register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_jcxz(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE;
+
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_in(struct x86_emulate_ctxt *ctxt)
@@ -4207,7 +4258,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
case OpMem8:
ctxt->memop.bytes = 1;
if (ctxt->memop.type == OP_REG) {
- ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
+ int highbyte_regs = ctxt->rex_prefix == 0;
+
+ ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm,
+ highbyte_regs);
fetch_register_operand(&ctxt->memop);
}
goto mem_common;
@@ -4714,7 +4768,7 @@ special_insn:
break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x8d: /* lea r16/r32, m */
ctxt->dst.val = ctxt->src.addr.mem.ea;
@@ -4743,7 +4797,7 @@ special_insn:
break;
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -4855,7 +4909,7 @@ twobyte_insn:
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 412a5aa0ef94..298781d4cfb4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -37,6 +37,7 @@
#include "irq.h"
#include "i8254.h"
+#include "x86.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -261,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
return;
timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
}
static void destroy_pit_timer(struct kvm_pit *pit)
@@ -349,6 +352,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "kvm: requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
+
hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 484bc874688b..3ec38cb56bd5 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
vector = kvm_cpu_get_extint(v);
- if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ if (vector != -1)
return vector; /* PIC */
return kvm_get_apic_interrupt(v); /* APIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d1..681e4e251f00 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -71,9 +71,6 @@
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
-static unsigned int min_timer_period_us = 500;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
*((u32 *) (apic->regs + reg_off)) = val;
@@ -153,6 +150,8 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
+#define KVM_X2APIC_CID_BITS 0
+
static void recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -190,7 +189,8 @@ static void recalculate_apic_map(struct kvm *kvm)
if (apic_x2apic_mode(apic)) {
new->ldr_bits = 32;
new->cid_shift = 16;
- new->cid_mask = new->lid_mask = 0xffff;
+ new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
+ new->lid_mask = 0xffff;
} else if (kvm_apic_sw_enabled(apic) &&
!new->cid_mask /* flat mode */ &&
kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
@@ -362,31 +362,90 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ /* try to update RVI */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else {
+ vec = apic_search_irr(apic);
+ apic->irr_pending = (vec != -1);
+ }
}
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
+ else {
++apic->isr_count;
- BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
/*
- * ISR (in service register) bit is set when injecting an interrupt.
- * The highest vector is injected. Thus the latest bit set matches
- * the highest bit in ISR.
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
*/
- apic->highest_isr_cache = vec;
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
}
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ struct kvm_vcpu *vcpu;
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
+ else {
--apic->isr_count;
- BUG_ON(apic->isr_count < 0);
- apic->highest_isr_cache = -1;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
}
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -466,22 +525,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
- int result;
-
- /* Note that isr_count is always 1 with vid enabled */
- if (!apic->isr_count)
- return -1;
- if (likely(apic->highest_isr_cache != -1))
- return apic->highest_isr_cache;
-
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -855,7 +898,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_apic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
return 0;
remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -1360,8 +1404,12 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
return;
}
+ if (!kvm_vcpu_is_bsp(apic->vcpu))
+ value &= ~MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base = value;
+
/* update jump label if enable bit changes */
- if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE)
static_key_slow_dec_deferred(&apic_hw_disabled);
else
@@ -1369,10 +1417,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
recalculate_apic_map(vcpu->kvm);
}
- if (!kvm_vcpu_is_bsp(apic->vcpu))
- value &= ~MSR_IA32_APICBASE_BSP;
-
- vcpu->arch.apic_base = value;
if ((old_value ^ value) & X2APIC_ENABLE) {
if (value & X2APIC_ENABLE) {
u32 id = kvm_apic_id(apic);
@@ -1621,6 +1665,13 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
if (vector == -1)
return -1;
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1705,7 +1756,6 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
- void *vapic;
if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
@@ -1713,9 +1763,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
- data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
- kunmap_atomic(vapic);
+ kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
@@ -1751,7 +1800,6 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
u32 data, tpr;
int max_irr, max_isr;
struct kvm_lapic *apic = vcpu->arch.apic;
- void *vapic;
apic_sync_pv_eoi_to_guest(vcpu, apic);
@@ -1767,18 +1815,24 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
- *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
- kunmap_atomic(vapic);
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
- vcpu->arch.apic->vapic_addr = vapic_addr;
- if (vapic_addr)
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
- else
+ } else {
__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
}
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c730ac9fe801..c8b0d0d2da5c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,7 +34,7 @@ struct kvm_lapic {
*/
void *regs;
gpa_t vapic_addr;
- struct page *vapic_page;
+ struct gfn_to_hva_cache vapic_cache;
unsigned long pending_events;
unsigned int sipi_vector;
};
@@ -76,7 +76,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0cc34f594c2a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2585,6 +2585,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int emulate = 0;
gfn_t pseudo_gfn;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return 0;
+
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
@@ -2748,6 +2751,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
bool ret = false;
u64 spte = 0ull;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return false;
+
if (!page_fault_can_be_fast(vcpu, error_code))
return false;
@@ -3066,7 +3072,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -3139,6 +3145,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
struct kvm_shadow_walk_iterator iterator;
u64 spte = 0ull;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return spte;
+
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
if (!is_shadow_present_pte(spte))
@@ -3232,7 +3241,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+ return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -4220,7 +4229,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (nr_to_scan == 0)
goto out;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
@@ -4256,7 +4265,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
out:
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
@@ -4329,6 +4338,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
u64 spte;
int nr_sptes = 0;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return nr_sptes;
+
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
sptes[iterator.level-1] = spte;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860b457a..a6a6370c2010 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -69,6 +69,7 @@ struct guest_walker {
pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
@@ -130,6 +131,22 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (pte == orig_pte)
continue;
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
if (ret)
return ret;
@@ -204,7 +221,8 @@ retry_walk:
goto error;
real_gfn = gpa_to_gfn(real_gfn);
- host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+ host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+ &walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -423,6 +441,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto out_gpte_changed;
+
for (shadow_walk_init(&it, vcpu, addr);
shadow_walk_okay(&it) && it.level > gw->level;
shadow_walk_next(&it)) {
@@ -671,6 +692,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
*/
mmu_topup_memory_caches(vcpu);
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ WARN_ON(1);
+ return;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, gva, iterator) {
level = iterator.level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6eaf871d..03f7d03c92a2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -606,7 +606,7 @@ static int has_svm(void)
return 1;
}
-static void svm_hardware_disable(void *garbage)
+static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
@@ -617,7 +617,7 @@ static void svm_hardware_disable(void *garbage)
amd_pmu_disable_virt();
}
-static int svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void)
{
struct svm_cpu_data *sd;
@@ -2985,10 +2985,8 @@ static int cr8_write_interception(struct vcpu_svm *svm)
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
- if (irqchip_in_kernel(svm->vcpu.kvm)) {
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ if (irqchip_in_kernel(svm->vcpu.kvm))
return r;
- }
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
@@ -3198,7 +3196,7 @@ static int wrmsr_interception(struct vcpu_svm *svm)
msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, &msr)) {
+ if (kvm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
} else {
@@ -3480,9 +3478,9 @@ static int handle_exit(struct kvm_vcpu *vcpu)
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
return svm_exit_handlers[exit_code](svm);
@@ -3550,6 +3548,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
if (irr == -1)
return;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a91939555..a3476bedd201 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2493,12 +2493,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
- kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
}
break;
}
@@ -2566,7 +2569,7 @@ static void kvm_cpu_vmxon(u64 addr)
: "memory", "cc");
}
-static int hardware_enable(void *garbage)
+static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2630,7 +2633,7 @@ static void kvm_cpu_vmxoff(void)
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
}
-static void hardware_disable(void *garbage)
+static void hardware_disable(void)
{
if (vmm_exclusive) {
vmclear_local_loaded_vmcss();
@@ -3399,15 +3402,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
+ /*
+ * Some userspaces do not preserve unusable property. Since usable
+ * segment has to be present according to VMX spec we can use present
+ * property to amend userspace bug by making unusable segment always
+ * nonpresent. vmx_segment_access_rights() already marks nonpresent
+ * segment as unusable.
+ */
+ var->present = !var->unusable;
var->avl = (ar >> 12) & 1;
var->l = (ar >> 13) & 1;
var->db = (ar >> 14) & 1;
var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
}
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -5055,7 +5065,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
msr.data = data;
msr.index = ecx;
msr.host_initiated = false;
- if (vmx_set_msr(vcpu, &msr) != 0) {
+ if (kvm_set_msr(vcpu, &msr) != 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -6644,10 +6654,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = exit_reason;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
- return 0;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -7126,8 +7136,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- free_nested(vmx);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_nested(vmx);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7942,7 +7952,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
- vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
/*
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
* actually changed, because it depends on the current state of
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8ba99c34180..ce20cb65de58 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
@@ -222,24 +225,29 @@ static void kvm_shared_msr_cpu_online(void)
shared_msr_update(i, shared_msrs_global.msrs[i]);
}
-void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int err;
if (((value ^ smsr->values[slot].curr) & mask) == 0)
- return;
+ return 0;
smsr->values[slot].curr = value;
- wrmsrl(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ if (err)
+ return 1;
+
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
smsr->registered = true;
}
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
-static void drop_user_return_notifiers(void *ignore)
+static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
@@ -917,7 +925,6 @@ void kvm_enable_efer_bits(u64 mask)
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
-
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -925,8 +932,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
+ switch (msr->index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(msr->data))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ msr->data = get_canonical(msr->data);
+ }
return kvm_x86_ops->set_msr(vcpu, msr);
}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
@@ -1193,20 +1226,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
+ int faulted = 0;
+
/* n.b - signed multiplication and division required */
usdiff = data - kvm->arch.last_tsc_write;
#ifdef CONFIG_X86_64
usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
#else
/* do_div() only does unsigned */
- asm("idivl %2; xor %%edx, %%edx"
- : "=A"(usdiff)
- : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+ asm("1: idivl %[divisor]\n"
+ "2: xor %%edx, %%edx\n"
+ " movl $0, %[faulted]\n"
+ "3:\n"
+ ".section .fixup,\"ax\"\n"
+ "4: movl $1, %[faulted]\n"
+ " jmp 3b\n"
+ ".previous\n"
+
+ _ASM_EXTABLE(1b, 4b)
+
+ : "=A"(usdiff), [faulted] "=r" (faulted)
+ : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
+
#endif
do_div(elapsed, 1000);
usdiff -= elapsed;
if (usdiff < 0)
usdiff = -usdiff;
+
+ /* idivl overflow => difference is larger than USEC_PER_SEC */
+ if (faulted)
+ usdiff = USEC_PER_SEC;
} else
usdiff = USEC_PER_SEC; /* disable TSC match window below */
@@ -2500,7 +2550,7 @@ out:
return r;
}
-int kvm_dev_ioctl_check_extension(long ext)
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -2510,6 +2560,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
@@ -2619,15 +2670,17 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
- case KVM_GET_SUPPORTED_CPUID: {
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
- r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
if (r)
goto out;
@@ -3138,8 +3191,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
goto out;
- r = 0;
- kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
case KVM_X86_SETUP_MCE: {
@@ -4785,7 +4837,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -5104,7 +5156,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
@@ -5114,7 +5166,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
send_ipi = 1;
}
}
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -5261,12 +5313,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
int i;
- raw_spin_lock(&kvm_lock);
+ spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
atomic_set(&kvm_guest_has_master_clock, 0);
- raw_spin_unlock(&kvm_lock);
+ spin_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5539,36 +5591,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
!kvm_event_needs_reinjection(vcpu);
}
-static int vapic_enter(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct page *page;
-
- if (!apic || !apic->vapic_addr)
- return 0;
-
- page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- if (is_error_page(page))
- return -EFAULT;
-
- vcpu->arch.apic->vapic_page = page;
- return 0;
-}
-
-static void vapic_exit(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int idx;
-
- if (!apic || !apic->vapic_addr)
- return;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_release_page_dirty(apic->vapic_page);
- mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-}
-
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
@@ -5889,11 +5911,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- r = vapic_enter(vcpu);
- if (r) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- return r;
- }
r = 1;
while (r > 0) {
@@ -5944,15 +5961,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_resched(vcpu);
+ cond_resched();
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- vapic_exit(vcpu);
-
return r;
}
@@ -6017,7 +6032,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
frag->len -= len;
}
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
vcpu->mmio_needed = 0;
if (vcpu->mmio_is_write)
return 1;
@@ -6600,7 +6615,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
kvm_rip_write(vcpu, 0);
}
-int kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@@ -6611,7 +6626,7 @@ int kvm_arch_hardware_enable(void *garbage)
bool stable, backwards_tsc = false;
kvm_shared_msr_cpu_online();
- ret = kvm_x86_ops->hardware_enable(garbage);
+ ret = kvm_x86_ops->hardware_enable();
if (ret != 0)
return ret;
@@ -6691,10 +6706,10 @@ int kvm_arch_hardware_enable(void *garbage)
return 0;
}
-void kvm_arch_hardware_disable(void *garbage)
+void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops->hardware_disable(garbage);
- drop_user_return_notifiers(garbage);
+ kvm_x86_ops->hardware_disable();
+ drop_user_return_notifiers();
}
int kvm_arch_hardware_setup(void)
@@ -6806,6 +6821,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
static_key_slow_dec(&kvm_no_apic_vcpu);
}
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
if (type)
@@ -6897,7 +6916,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
}
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
int i;
@@ -6918,7 +6937,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
}
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned long npages)
{
int i;
@@ -6976,6 +6996,10 @@ out_free:
return -ENOMEM;
}
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem,
@@ -7114,7 +7138,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
int r;
if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
+ work->wakeup_all)
return;
r = kvm_mmu_reload(vcpu);
@@ -7224,7 +7248,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct x86_exception fault;
trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
+ if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e224f7a671b6..7626d3efa064 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -78,15 +78,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
}
/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
*/
+#define MMIO_GVA_ANY (~(gva_t)0)
+
static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
{
- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
return;
vcpu->arch.mmio_gva = 0;
@@ -94,7 +102,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
{
- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
return true;
return false;
@@ -102,7 +111,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
return true;
return false;
@@ -124,5 +134,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
extern u64 host_xcr0;
+extern unsigned int min_timer_period_us;
+
extern struct static_key kvm_no_apic_vcpu;
#endif
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 25b7ae8d058a..7609e0e421ec 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
*/
#include <asm/checksum.h>
#include <linux/module.h>
+#include <asm/smap.h>
/**
* csum_partial_copy_from_user - Copy and checksum from user space.
@@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
len -= 2;
}
}
+ stac();
isum = csum_partial_copy_generic((__force const void *)src,
dst, len, isum, errp, NULL);
+ clac();
if (unlikely(*errp))
goto out_err;
@@ -82,6 +85,8 @@ __wsum
csum_partial_copy_to_user(const void *src, void __user *dst,
int len, __wsum isum, int *errp)
{
+ __wsum ret;
+
might_sleep();
if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
@@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
}
*errp = 0;
- return csum_partial_copy_generic(src, (void __force *)dst,
- len, isum, NULL, errp);
+ stac();
+ ret = csum_partial_copy_generic(src, (void __force *)dst,
+ len, isum, NULL, errp);
+ clac();
+ return ret;
}
EXPORT_SYMBOL(csum_partial_copy_to_user);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a33081..e04e67753238 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,13 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +47,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +70,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -163,7 +167,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,6 +182,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
+ st->lines = 0;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
@@ -188,17 +193,21 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = (st->current_address - st->start_address);
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;
/*
* We print markers for special areas of address space,
@@ -206,7 +215,15 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ seq_printf(m, "... %lu entr%s skipped ... \n",
+ nskip, nskip == 1 ? "y" : "ies");
+ }
st->marker++;
+ st->lines = 0;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4ae3047..d8b1ff68dbb9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline int
+static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
- /*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue pagefault.
- */
- if (fatal_signal_pending(current)) {
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- if (!(error_code & PF_USER))
- no_context(regs, error_code, address, 0, 0);
- return 1;
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address, 0, 0);
+ return;
}
- if (!(fault & VM_FAULT_ERROR))
- return 0;
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
@@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
- return 1;
+ return;
}
up_read(&current->mm->mmap_sem);
@@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
- return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -989,6 +980,12 @@ static int fault_in_kernel_space(unsigned long address)
static inline bool smap_violation(int error_code, struct pt_regs *regs)
{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
if (error_code & PF_USER)
return false;
@@ -1011,9 +1008,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long address;
struct mm_struct *mm;
int fault;
- int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
- (write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
@@ -1083,6 +1078,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
@@ -1091,11 +1087,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- if (static_cpu_has(X86_FEATURE_SMAP)) {
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
@@ -1109,6 +1103,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
}
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
@@ -1187,9 +1184,17 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (mm_fault_error(regs, error_code, address, fault))
- return;
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ return;
+
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
/*
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71d0115..7e73e8c69096 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -16,169 +16,6 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
- struct vm_area_struct *vma,
- unsigned long addr, pgoff_t idx)
-{
- unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
- svma->vm_start;
- unsigned long sbase = saddr & PUD_MASK;
- unsigned long s_end = sbase + PUD_SIZE;
-
- /* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
-
- /*
- * match the virtual addresses, permission and the alignment of the
- * page table page.
- */
- if (pmd_index(addr) != pmd_index(saddr) ||
- vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
- return 0;
-
- return saddr;
-}
-
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
-}
-
-/*
- * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
- * and returns the corresponding pte. While this is not necessary for the
- * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
- */
-static pte_t *
-huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
- struct vm_area_struct *vma = find_vma(mm, addr);
- struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- struct vm_area_struct *svma;
- unsigned long saddr;
- pte_t *spte = NULL;
- pte_t *pte;
-
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-
- saddr = page_table_shareable(svma, vma, addr, idx);
- if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
- if (spte) {
- get_page(virt_to_page(spte));
- break;
- }
- }
- }
-
- if (!spte)
- goto out;
-
- spin_lock(&mm->page_table_lock);
- if (pud_none(*pud))
- pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
- put_page(virt_to_page(spte));
- spin_unlock(&mm->page_table_lock);
-out:
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_unlock(&mapping->i_mmap_mutex);
- return pte;
-}
-
-/*
- * unmap huge page backed by shared pte.
- *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * called with vma->vm_mm->page_table_lock held.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
- */
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
-
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
- return 0;
-
- pud_clear(pud);
- put_page(virt_to_page(ptep));
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
- return 1;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pgd;
- pud_t *pud;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
- if (pud) {
- if (sz == PUD_SIZE) {
- pte = (pte_t *)pud;
- } else {
- BUG_ON(sz != PMD_SIZE);
- if (pud_none(*pud))
- pte = huge_pmd_share(mm, addr, pud);
- else
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
- }
- }
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
-
- return pte;
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_large(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
- return (pte_t *) pmd;
-}
-
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
@@ -240,30 +77,6 @@ int pud_huge(pud_t pud)
return !!(pud_val(pud) & _PAGE_PSE);
}
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
#endif
/* x86_64 also uses this file */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1f34e9219775..7a5bf1b76e2f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num)
return __va(pfn << PAGE_SHIFT);
}
-/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
+/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9a1e6583910c..86c758de4b34 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..5c1ae28825cd 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -112,12 +112,14 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ mm->mmap_legacy_base = mmap_legacy_base();
+ mm->mmap_base = mmap_base();
+
if (mmap_is_legacy()) {
- mm->mmap_base = mmap_legacy_base();
+ mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
mm->unmap_area = arch_unmap_area;
} else {
- mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
mm->unmap_area = arch_unmap_area_topdown;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480c2d71..aabdf762f592 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -389,7 +389,7 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
psize = page_level_size(level);
pmask = page_level_mask(level);
offset = virt_addr & ~pmask;
- phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
return (phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 877b9a1b2152..01495755701b 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -140,7 +140,7 @@ bpf_slow_path_byte_msh:
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
- mov $SIZE,%ecx; /* size */ \
+ mov $SIZE,%edx; /* size */ \
call bpf_internal_load_pointer_neg_helper; \
test %rax,%rax; \
pop SKBDATA; \
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f66b54086ce5..0c966fecfb8c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -324,15 +324,21 @@ void bpf_jit_compile(struct sk_filter *fp)
EMIT2(0x89, 0xd0); /* mov %edx,%eax */
break;
case BPF_S_ALU_MOD_K: /* A %= K; */
+ if (K == 1) {
+ CLEAR_A();
+ break;
+ }
EMIT2(0x31, 0xd2); /* xor %edx,%edx */
EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
EMIT2(0xf7, 0xf1); /* div %ecx */
EMIT2(0x89, 0xd0); /* mov %edx,%eax */
break;
- case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
- EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
- EMIT(K, 4);
- EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
+ case BPF_S_ALU_DIV_K: /* A /= K */
+ if (K == 1)
+ break;
+ EMIT2(0x31, 0xd2); /* xor %edx,%edx */
+ EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
+ EMIT2(0xf7, 0xf1); /* div %ecx */
break;
case BPF_S_ALU_AND_X:
seen |= SEEN_XREG;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 94919e307f8e..2883f0840201 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res,
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
}
return start;
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index d2fbcedcf6ea..816e940b3998 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -60,34 +60,19 @@
static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
-struct efi __read_mostly efi = {
- .mps = EFI_INVALID_TABLE_ADDR,
- .acpi = EFI_INVALID_TABLE_ADDR,
- .acpi20 = EFI_INVALID_TABLE_ADDR,
- .smbios = EFI_INVALID_TABLE_ADDR,
- .sal_systab = EFI_INVALID_TABLE_ADDR,
- .boot_info = EFI_INVALID_TABLE_ADDR,
- .hcdp = EFI_INVALID_TABLE_ADDR,
- .uga = EFI_INVALID_TABLE_ADDR,
- .uv_systab = EFI_INVALID_TABLE_ADDR,
-};
-EXPORT_SYMBOL(efi);
-
struct efi_memory_map memmap;
static struct efi efi_phys __initdata;
static efi_system_table_t efi_systab __initdata;
-unsigned long x86_efi_facility;
+static __initdata efi_config_table_type_t arch_tables[] = {
+#ifdef CONFIG_X86_UV
+ {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab},
+#endif
+ {NULL_GUID, NULL, 0},
+};
-/*
- * Returns 1 if 'facility' is enabled, 0 otherwise.
- */
-int efi_enabled(int facility)
-{
- return test_bit(facility, &x86_efi_facility) != 0;
-}
-EXPORT_SYMBOL(efi_enabled);
+u64 efi_setup; /* efi setup_data physical address */
static bool __initdata disable_runtime = false;
static int __init setup_noefi(char *arg)
@@ -397,6 +382,8 @@ int __init efi_memblock_x86_reserve_range(void)
memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
+ efi.memmap = &memmap;
+
return 0;
}
@@ -438,7 +425,7 @@ void __init efi_reserve_boot_services(void)
* - Not within any part of the kernel
* - Not the bios reserved area
*/
- if ((start+size >= __pa_symbol(_text)
+ if ((start + size > __pa_symbol(_text)
&& start <= __pa_symbol(_end)) ||
!e820_all_mapped(start, start+size, E820_RAM) ||
memblock_is_region_reserved(start, size)) {
@@ -454,7 +441,7 @@ void __init efi_reserve_boot_services(void)
void __init efi_unmap_memmap(void)
{
- clear_bit(EFI_MEMMAP, &x86_efi_facility);
+ clear_bit(EFI_MEMMAP, &efi.flags);
if (memmap.map) {
early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
@@ -576,80 +563,6 @@ static int __init efi_systab_init(void *phys)
return 0;
}
-static int __init efi_config_init(u64 tables, int nr_tables)
-{
- void *config_tables, *tablep;
- int i, sz;
-
- if (efi_enabled(EFI_64BIT))
- sz = sizeof(efi_config_table_64_t);
- else
- sz = sizeof(efi_config_table_32_t);
-
- /*
- * Let's see what config tables the firmware passed to us.
- */
- config_tables = early_ioremap(tables, nr_tables * sz);
- if (config_tables == NULL) {
- pr_err("Could not map Configuration table!\n");
- return -ENOMEM;
- }
-
- tablep = config_tables;
- pr_info("");
- for (i = 0; i < efi.systab->nr_tables; i++) {
- efi_guid_t guid;
- unsigned long table;
-
- if (efi_enabled(EFI_64BIT)) {
- u64 table64;
- guid = ((efi_config_table_64_t *)tablep)->guid;
- table64 = ((efi_config_table_64_t *)tablep)->table;
- table = table64;
-#ifdef CONFIG_X86_32
- if (table64 >> 32) {
- pr_cont("\n");
- pr_err("Table located above 4GB, disabling EFI.\n");
- early_iounmap(config_tables,
- efi.systab->nr_tables * sz);
- return -EINVAL;
- }
-#endif
- } else {
- guid = ((efi_config_table_32_t *)tablep)->guid;
- table = ((efi_config_table_32_t *)tablep)->table;
- }
- if (!efi_guidcmp(guid, MPS_TABLE_GUID)) {
- efi.mps = table;
- pr_cont(" MPS=0x%lx ", table);
- } else if (!efi_guidcmp(guid, ACPI_20_TABLE_GUID)) {
- efi.acpi20 = table;
- pr_cont(" ACPI 2.0=0x%lx ", table);
- } else if (!efi_guidcmp(guid, ACPI_TABLE_GUID)) {
- efi.acpi = table;
- pr_cont(" ACPI=0x%lx ", table);
- } else if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) {
- efi.smbios = table;
- pr_cont(" SMBIOS=0x%lx ", table);
-#ifdef CONFIG_X86_UV
- } else if (!efi_guidcmp(guid, UV_SYSTEM_TABLE_GUID)) {
- efi.uv_systab = table;
- pr_cont(" UVsystab=0x%lx ", table);
-#endif
- } else if (!efi_guidcmp(guid, HCDP_TABLE_GUID)) {
- efi.hcdp = table;
- pr_cont(" HCDP=0x%lx ", table);
- } else if (!efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID)) {
- efi.uga = table;
- pr_cont(" UGA=0x%lx ", table);
- }
- tablep += sz;
- }
- pr_cont("\n");
- early_iounmap(config_tables, efi.systab->nr_tables * sz);
- return 0;
-}
-
static int __init efi_runtime_init(void)
{
efi_runtime_services_t *runtime;
@@ -725,7 +638,11 @@ void __init efi_init(void)
if (efi_systab_init(efi_phys.systab))
return;
- set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
+ set_bit(EFI_SYSTEM_TABLES, &efi.flags);
+
+ efi.config_table = (unsigned long)efi.systab->tables;
+ efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
+ efi.runtime = (unsigned long)efi.systab->runtime;
/*
* Show what we know for posterity
@@ -743,10 +660,10 @@ void __init efi_init(void)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
- if (efi_config_init(efi.systab->tables, efi.systab->nr_tables))
+ if (efi_config_init(arch_tables))
return;
- set_bit(EFI_CONFIG_TABLES, &x86_efi_facility);
+ set_bit(EFI_CONFIG_TABLES, &efi.flags);
/*
* Note: We currently don't support runtime services on an EFI
@@ -758,20 +675,13 @@ void __init efi_init(void)
else {
if (disable_runtime || efi_runtime_init())
return;
- set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility);
+ set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
}
if (efi_memmap_init())
return;
- set_bit(EFI_MEMMAP, &x86_efi_facility);
-
-#ifdef CONFIG_X86_32
- if (efi_is_native()) {
- x86_platform.get_wallclock = efi_get_time;
- x86_platform.set_wallclock = efi_set_rtc_mmss;
- }
-#endif
+ set_bit(EFI_MEMMAP, &efi.flags);
#if EFI_DEBUG
print_efi_memmap();
@@ -814,34 +724,6 @@ static void __init runtime_code_page_mkexec(void)
}
}
-/*
- * We can't ioremap data in EFI boot services RAM, because we've already mapped
- * it as RAM. So, look it up in the existing EFI memory map instead. Only
- * callable after efi_enter_virtual_mode and before efi_free_boot_services.
- */
-void __iomem *efi_lookup_mapped_addr(u64 phys_addr)
-{
- void *p;
- if (WARN_ON(!memmap.map))
- return NULL;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- u64 size = md->num_pages << EFI_PAGE_SHIFT;
- u64 end = md->phys_addr + size;
- if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
- md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
- if (!md->virt_addr)
- continue;
- if (phys_addr >= md->phys_addr && phys_addr < end) {
- phys_addr += md->virt_addr - md->phys_addr;
- return (__force void __iomem *)(unsigned long)phys_addr;
- }
- }
- return NULL;
-}
-
void efi_memory_uc(u64 addr, unsigned long size)
{
unsigned long page_shift = 1UL << EFI_PAGE_SHIFT;
@@ -910,10 +792,13 @@ void __init efi_enter_virtual_mode(void)
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
- md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
+ if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
+#ifdef CONFIG_X86_64
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+#endif
+ continue;
+ }
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 88692871823f..9cac82588cbc 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -73,9 +73,10 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
-march=i386 -mregparm=3 \
-include $(srctree)/$(src)/../../boot/code16gcc.h \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+ -mno-mmx -mno-sse \
$(call cc-option, -ffreestanding) \
$(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
+ $(call cc-option, -fno-unit-at-a-time)) \
$(call cc-option, -fno-stack-protector) \
$(call cc-option, -mpreferred-stack-boundary=2)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..63a899304d27 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -212,10 +212,10 @@
203 common sched_setaffinity sys_sched_setaffinity
204 common sched_getaffinity sys_sched_getaffinity
205 64 set_thread_area
-206 common io_setup sys_io_setup
+206 64 io_setup sys_io_setup
207 common io_destroy sys_io_destroy
208 common io_getevents sys_io_getevents
-209 common io_submit sys_io_submit
+209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
212 common lookup_dcookie sys_lookup_dcookie
@@ -356,3 +356,5 @@
540 x32 process_vm_writev compat_sys_process_vm_writev
541 x32 setsockopt compat_sys_setsockopt
542 x32 getsockopt compat_sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927e..156344448d19 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -878,7 +878,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
- int ret = 0;
pfn = page_to_pfn(page);
if (!PageHighMem(page)) {
@@ -925,8 +924,8 @@ int m2p_add_override(unsigned long mfn, struct page *page,
* frontend pages while they are being shared with the backend,
* because mfn_to_pfn (that ends up being called by GUPF) will
* return the backend pfn rather than the frontend pfn. */
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
- if (ret == 0 && get_phys_to_machine(pfn) == mfn)
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == mfn)
set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
return 0;
@@ -941,7 +940,6 @@ int m2p_remove_override(struct page *page,
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
- int ret = 0;
pfn = page_to_pfn(page);
mfn = get_phys_to_machine(pfn);
@@ -1019,8 +1017,8 @@ int m2p_remove_override(struct page *page,
* the original pfn causes mfn_to_pfn(mfn) to return the frontend
* pfn again. */
mfn &= ~FOREIGN_FRAME_BIT;
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
- if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
m2p_find_override(mfn) == NULL)
set_phys_to_machine(pfn, mfn);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 94eac5c85cdc..0a9fb7a0b452 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -313,6 +313,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
e820_add_region(start, end - start, type);
}
+void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+{
+ struct e820entry *entry;
+ unsigned int i;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ if (entry->type == E820_UNUSABLE)
+ entry->type = E820_RAM;
+ }
+}
+
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
@@ -353,6 +364,17 @@ char * __init xen_memory_setup(void)
}
BUG_ON(rc);
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable(map, memmap.nr_entries);
+
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d99cae8147d1..570c9a5c4d3f 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -245,6 +245,15 @@ static void __init xen_smp_prepare_boot_cpu(void)
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
+#ifdef CONFIG_X86_32
+ /*
+ * Xen starts us with XEN_FLAT_RING1_DS, but linux code
+ * expects __USER_DS
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
}
@@ -667,8 +676,15 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
- rc = native_cpu_up(cpu, tidle);
- WARN_ON (xen_smp_intr_init(cpu));
+ /*
+ * xen_smp_intr_init() needs to run before native_cpu_up()
+ * so that IPI vectors are set up on the booting CPU before
+ * it is marked online in native_cpu_up().
+ */
+ rc = xen_smp_intr_init(cpu);
+ WARN_ON(rc);
+ if (!rc)
+ rc = native_cpu_up(cpu, tidle);
return rc;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 3d88bfdf9e1c..13e8935e2eab 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -36,9 +36,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
-/* unused ns of stolen and blocked time */
+/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
-static DEFINE_PER_CPU(u64, xen_residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
@@ -115,7 +114,7 @@ static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
- s64 blocked, runnable, offline, stolen;
+ s64 runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
@@ -125,7 +124,6 @@ static void do_stolen_accounting(void)
snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
- blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
@@ -141,17 +139,6 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
-
- /* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. */
- blocked += __this_cpu_read(xen_residual_blocked);
-
- if (blocked < 0)
- blocked = 0;
-
- ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
- __this_cpu_write(xen_residual_blocked, blocked);
- account_idle_ticks(ticks);
}
/* Get the TSC speed from Xen */