Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/boot/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 9
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 1
-rw-r--r--  arch/x86/crypto/sha-mb/sha1_mb.c | 4
-rw-r--r--  arch/x86/ia32/ia32entry.S | 1
-rw-r--r--  arch/x86/include/asm/acpi.h | 1
-rw-r--r--  arch/x86/include/asm/apic.h | 2
-rw-r--r--  arch/x86/include/asm/desc.h | 15
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 1
-rw-r--r--  arch/x86/include/asm/iommu_table.h | 11
-rw-r--r--  arch/x86/include/asm/kasan.h | 34
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 2
-rw-r--r--  arch/x86/include/asm/mmu.h | 3
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 48
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 12
-rw-r--r--  arch/x86/include/asm/paravirt.h | 6
-rw-r--r--  arch/x86/include/asm/perf_event.h | 8
-rw-r--r--  arch/x86/include/asm/pgtable.h | 5
-rw-r--r--  arch/x86/include/asm/processor.h | 36
-rw-r--r--  arch/x86/include/asm/segment.h | 15
-rw-r--r--  arch/x86/include/asm/special_insns.h | 6
-rw-r--r--  arch/x86/include/asm/string_64.h | 18
-rw-r--r--  arch/x86/include/asm/thread_info.h | 27
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 77
-rw-r--r--  arch/x86/include/asm/virtext.h | 5
-rw-r--r--  arch/x86/include/asm/x86_init.h | 3
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h | 6
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 1
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 9
-rw-r--r--  arch/x86/kernel/amd_nb.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 7
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 8
-rw-r--r--  arch/x86/kernel/cpu/common.c | 27
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 3
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel_early.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 20
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 12
-rw-r--r--  arch/x86/kernel/crash.c | 7
-rw-r--r--  arch/x86/kernel/dumpstack.c | 5
-rw-r--r--  arch/x86/kernel/early-quirks.c | 105
-rw-r--r--  arch/x86/kernel/entry_32.S | 5
-rw-r--r--  arch/x86/kernel/entry_64.S | 300
-rw-r--r--  arch/x86/kernel/head32.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 13
-rw-r--r--  arch/x86/kernel/head_32.S | 46
-rw-r--r--  arch/x86/kernel/head_64.S | 50
-rw-r--r--  arch/x86/kernel/i387.c | 3
-rw-r--r--  arch/x86/kernel/ioport.c | 12
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 19
-rw-r--r--  arch/x86/kernel/ldt.c | 262
-rw-r--r--  arch/x86/kernel/module.c | 14
-rw-r--r--  arch/x86/kernel/nmi.c | 123
-rw-r--r--  arch/x86/kernel/paravirt.c | 16
-rw-r--r--  arch/x86/kernel/process.c | 5
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 70
-rw-r--r--  arch/x86/kernel/setup.c | 5
-rw-r--r--  arch/x86/kernel/step.c | 8
-rw-r--r--  arch/x86/kernel/sysfb_efi.c | 14
-rw-r--r--  arch/x86/kernel/tsc.c | 17
-rw-r--r--  arch/x86/kernel/tsc_msr.c | 2
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 10
-rw-r--r--  arch/x86/kernel/x86_init.c | 10
-rw-r--r--  arch/x86/kernel/xsave.c | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 2
-rw-r--r--  arch/x86/kvm/i8254.c | 14
-rw-r--r--  arch/x86/kvm/lapic.c | 5
-rw-r--r--  arch/x86/kvm/lapic.h | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 45
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 2
-rw-r--r--  arch/x86/kvm/svm.c | 18
-rw-r--r--  arch/x86/kvm/vmx.c | 35
-rw-r--r--  arch/x86/kvm/x86.c | 56
-rw-r--r--  arch/x86/lib/memcpy_64.S | 6
-rw-r--r--  arch/x86/lib/memmove_64.S | 4
-rw-r--r--  arch/x86/lib/memset_64.S | 10
-rw-r--r--  arch/x86/mm/Makefile | 3
-rw-r--r--  arch/x86/mm/fault.c | 2
-rw-r--r--  arch/x86/mm/init.c | 12
-rw-r--r--  arch/x86/mm/init_32.c | 1
-rw-r--r--  arch/x86/mm/init_64.c | 2
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 209
-rw-r--r--  arch/x86/mm/kmmio.c | 88
-rw-r--r--  arch/x86/mm/pageattr.c | 4
-rw-r--r--  arch/x86/mm/tlb.c | 3
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 35
-rw-r--r--  arch/x86/pci/acpi.c | 17
-rw-r--r--  arch/x86/pci/common.c | 2
-rw-r--r--  arch/x86/pci/fixup.c | 7
-rw-r--r--  arch/x86/pci/intel_mid_pci.c | 10
-rw-r--r--  arch/x86/pci/irq.c | 7
-rw-r--r--  arch/x86/pci/xen.c | 28
-rw-r--r--  arch/x86/platform/efi/efi.c | 67
-rw-r--r--  arch/x86/power/cpu.c | 14
-rw-r--r--  arch/x86/realmode/Makefile | 2
-rw-r--r--  arch/x86/realmode/init.c | 2
-rw-r--r--  arch/x86/realmode/rm/Makefile | 1
-rw-r--r--  arch/x86/vdso/Makefile | 1
-rw-r--r--  arch/x86/xen/Kconfig | 4
-rw-r--r--  arch/x86/xen/Makefile | 4
-rw-r--r--  arch/x86/xen/enlighten.c | 70
-rw-r--r--  arch/x86/xen/xen-ops.h | 6
111 files changed, 1709 insertions(+), 702 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 25d692ddae4e..7d8f07b25f71 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
select HAVE_CMPXCHG_LOCAL
select HAVE_CMPXCHG_DOUBLE
select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_USER_RETURN_NOTIFIER
select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select HAVE_ARCH_JUMP_LABEL
@@ -174,7 +175,7 @@ config SBUS
config NEED_DMA_MAP_STATE
def_bool y
- depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
+ depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
config NEED_SG_DMA_LENGTH
def_bool y
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3db07f30636f..c45301c272d0 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,6 +9,8 @@
# Changed by many, many contributors over the years.
#
+KASAN_SANITIZE := n
+
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -160,6 +162,9 @@ isoimage: $(obj)/bzImage
for i in lib lib64 share end ; do \
if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
+ if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \
+ cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \
+ fi ; \
break ; \
fi ; \
if [ $$i = end ] ; then exit 1 ; fi ; \
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 30c0acf4ea6c..2959cca0b90c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,6 +4,8 @@
# create a compressed vmlinux image from the original vmlinux
#
+KASAN_SANITIZE := n
+
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 1acf605a646d..2da9cef8ee43 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -13,8 +13,7 @@
#include <asm/setup.h>
#include <asm/desc.h>
-#undef memcpy /* Use memcpy from misc.c */
-
+#include "../string.h"
#include "eboot.h"
static efi_system_table_t *sys_table;
@@ -1110,6 +1109,8 @@ struct boot_params *make_boot_params(struct efi_config *c)
if (!cmdline_ptr)
goto fail;
hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
+ /* Fill in upper bits of command line address, NOP on 32 bit */
+ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32;
hdr->ramdisk_image = 0;
hdr->ramdisk_size = 0;
@@ -1192,6 +1193,10 @@ static efi_status_t setup_e820(struct boot_params *params,
unsigned int e820_type = 0;
unsigned long m = efi->efi_memmap;
+#ifdef CONFIG_X86_64
+ m |= (u64)efi->efi_memmap_hi << 32;
+#endif
+
d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
switch (d->type) {
case EFI_RESERVED_TYPE:
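
Both hunks above apply the same pattern: several boot_params fields are 32 bits wide for historical reasons, so a full 64-bit address has to be split between the legacy field and its "ext"/"_hi" companion. A minimal sketch of the idea (illustrative helper, not the actual eboot.c code):

    /* Split a 64-bit address across a legacy 32-bit field and its
     * "ext" companion; the high half is always 0 on 32-bit. */
    static void set_cmd_line_ptr(struct boot_params *bp, u64 addr)
    {
            bp->hdr.cmd_line_ptr = (u32)addr;         /* low 32 bits  */
            bp->ext_cmd_line_ptr = (u32)(addr >> 32); /* high 32 bits */
    }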
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 24e3e569a13c..04477d68403f 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
* we just keep it from happening
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_KASAN
#ifdef CONFIG_X86_32
#define _ASM_X86_DESC_H 1
#endif
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..de1d72e3ec59 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -291,6 +291,7 @@ static struct ahash_alg ghash_async_alg = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
.cra_priority = 400,
+ .cra_ctxsize = sizeof(struct ghash_async_ctx),
.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_type = &crypto_ahash_type,
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index d42c9b7fadcf..418319b3763e 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -457,10 +457,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
req = cast_mcryptd_ctx_to_req(req_ctx);
if (irqs_disabled())
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
else {
local_bh_disable();
- rctx->complete(&req->base, ret);
+ req_ctx->complete(&req->base, ret);
local_bh_enable();
}
}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffe71228fc10..a39e89eaa763 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -422,6 +422,7 @@ ENTRY(ia32_syscall)
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
PARAVIRT_ADJUST_EXCEPTION_FRAME
+ ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
/*
* No need to follow this irqs on/off section: the syscall
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 0ab4f9fd2687..3a45668f6dc3 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -50,6 +50,7 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity);
+extern void (*__acpi_unregister_gsi)(u32 gsi);
static inline void disable_acpi(void)
{
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 465b309af254..dbaf844ddcb1 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -651,8 +651,8 @@ static inline void entering_irq(void)
static inline void entering_ack_irq(void)
{
- ack_APIC_irq();
entering_irq();
+ ack_APIC_irq();
}
static inline void exiting_irq(void)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..69126184c609 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -280,21 +280,6 @@ static inline void clear_LDT(void)
set_ldt(NULL, 0);
}
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
- set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
- preempt_disable();
- load_LDT_nolock(pc);
- preempt_enable();
-}
-
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 68c05398bba9..7aadd3cea843 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -4,6 +4,7 @@
#include <asm/page.h>
#include <asm-generic/hugetlb.h>
+#define hugepages_supported() cpu_has_pse
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr,
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
index f42a04735a0a..e37d6b3ad983 100644
--- a/arch/x86/include/asm/iommu_table.h
+++ b/arch/x86/include/asm/iommu_table.h
@@ -79,11 +79,12 @@ struct iommu_table_entry {
* d). Similar to the 'init', except that this gets called from pci_iommu_init
* where we do have a memory allocator.
*
- * The standard vs the _FINISH differs in that the _FINISH variant will
- * continue detecting other IOMMUs in the call list after the
- * the detection routine returns a positive number. The _FINISH will
- * stop the execution chain. Both will still call the 'init' and
- * 'late_init' functions if they are set.
+ * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
+ * in that the former will continue detecting other IOMMUs in the call
+ * list after the detection routine returns a positive number, while the
+ * latter will stop the execution chain upon first successful detection.
+ * Both variants will still call the 'init' and 'late_init' functions if
+ * they are set.
*/
#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
__IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000000..491e4fd7754e
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+#include <linux/const.h>
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
+/*
+ * The compiler uses the shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so the shadow for the
+ * kernel really starts at the compiler's shadow offset plus
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT.
+ */
+#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
+ (0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+
+#ifndef __ASSEMBLY__
+
+extern pte_t kasan_zero_pte[];
+extern pte_t kasan_zero_pmd[];
+extern pte_t kasan_zero_pud[];
+
+#ifdef CONFIG_KASAN
+void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_init(void);
+#else
+static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
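
KASAN_SHADOW_SCALE_SHIFT is 3 on x86_64, so one shadow byte tracks eight bytes of address space. A sketch of the mapping the constants above encode (illustrative function, not part of the patch):

    static inline unsigned long kasan_mem_to_shadow_sketch(unsigned long addr)
    {
            return (addr >> 3) + KASAN_SHADOW_OFFSET;
    }

    /*
     * Plugging in the start of the kernel range:
     *   kasan_mem_to_shadow_sketch(0xffff800000000000UL)
     *     == KASAN_SHADOW_OFFSET + (0xffff800000000000UL >> 3)
     *     == KASAN_SHADOW_START
     * and the 47-bit kernel range shrinks to 47 - 3 = 44 bits of
     * shadow, which is exactly the KASAN_SHADOW_END definition.
     */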
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6c0709ff2f38..306d152336cd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -571,7 +571,7 @@ struct kvm_arch {
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
- int vapics_in_nmi_mode;
+ atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..b6b7bc3f5d26 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,8 +9,7 @@
* we put the segment information here.
*/
typedef struct {
- void *ldt;
- int size;
+ struct ldt_struct *ldt;
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 166af2a8e865..23e0625a6183 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -20,6 +20,50 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
#endif /* !CONFIG_PARAVIRT */
/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+ /*
+ * Xen requires page-aligned LDTs with special permissions. This is
+ * needed to prevent us from installing evil descriptors such as
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+ struct desc_struct *entries;
+ int size;
+};
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+ struct ldt_struct *ldt;
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt))
+ set_ldt(ldt->entries, ldt->size);
+ else
+ clear_LDT();
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
@@ -55,7 +99,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Load the LDT, if the LDT is different: */
if (unlikely(prev->context.ldt != next->context.ldt))
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
#ifdef CONFIG_SMP
else {
@@ -77,7 +121,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
}
#endif
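
The lockless_dereference() in load_mm_ldt() pairs with an smp_store_release() on the write side, which this series adds in arch/x86/kernel/ldt.c. A minimal sketch of that publish sequence, assuming an install_ldt()-style helper and a flush_ldt() IPI callback (names illustrative):

    static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
    {
            /* Synchronizes with lockless_dereference() in load_mm_ldt(). */
            smp_store_release(&mm->context.ldt, ldt);

            /* Reload on every CPU running this mm before the old
             * ldt_struct can be freed, as the comment above requires. */
            on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
    }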
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 75450b2c7be4..4edd53b79a81 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#define THREAD_SIZE_ORDER 2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
#define DOUBLEFAULT_STACK 1
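
The net effect of KASAN_STACK_ORDER is to double every kernel stack, since instrumented code needs extra room for redzones and shadow bookkeeping. Assuming 4 KiB pages, the arithmetic works out as in this standalone check (illustration only):

    #include <assert.h>

    int main(void)
    {
            unsigned long page_size = 4096;  /* assumed 4 KiB pages */
            unsigned long kasan_order = 1;   /* CONFIG_KASAN=y */

            /* THREAD_SIZE and IRQ_STACK_SIZE: 16 KiB -> 32 KiB */
            assert((page_size << (2 + kasan_order)) == 32768);
            /* EXCEPTION_STKSZ: 4 KiB -> 8 KiB */
            assert((page_size << (0 + kasan_order)) == 8192);
            return 0;
    }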
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cd6e1610e29e..a9d76e02301b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8dfc9fd094a3..024fa1a20f15 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -159,6 +159,14 @@ struct x86_pmu_capability {
*/
#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55)
+
/*
* IBS cpuid feature detection
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index aa97a070f09f..081d6f45e006 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -99,6 +99,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline int pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY;
+}
+
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec794732..26d5e05a7def 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -578,39 +578,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features |= mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features &= ~mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
typedef struct {
unsigned long seg;
} mm_segment_t;
@@ -885,7 +852,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define task_pt_regs(task) \
({ \
struct pt_regs *__regs__; \
- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \
+ TOP_OF_KERNEL_STACK_PADDING); \
__regs__ - 1; \
})
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 6f1c3a8a33ab..bcc9a2f46c62 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -212,10 +212,21 @@
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
#ifdef __KERNEL__
+
+/*
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT. For simplicity, it's a real array with one entry point
+ * every nine bytes. That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
+ */
+#define EARLY_IDT_HANDLER_SIZE 9
+
#ifndef __ASSEMBLY__
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
+
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
#ifdef CONFIG_TRACING
-#define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handler_array early_idt_handler_array
#endif
/*
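
The nine-byte budget splits as 2 + 2 + 5. As a sketch, a stub for a vector where the CPU pushes no error code could encode as follows (hand-assembled bytes for illustration; the real stubs are emitted by head_32.S/head_64.S):

    /* One 9-byte early IDT stub, vector 3 shown: */
    static const unsigned char early_idt_stub[9] = {
            0x6a, 0x00,                   /* push $0   - fake error code (2 bytes) */
            0x6a, 0x03,                   /* push $3   - vector number   (2 bytes) */
            0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp rel32 - to common code  (5 bytes) */
    };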
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..6a4b00fafb00 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return native_read_cr4_safe();
}
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c468fc2c..e4661196994e 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
function. */
#define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
extern void *memcpy(void *to, const void *from, size_t len);
#else
-extern void *__memcpy(void *to, const void *from, size_t len);
#define memcpy(dst, src, len) \
({ \
size_t __len = (len); \
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
int memcmp(const void *cs, const void *ct, size_t count);
size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
char *strcat(char *dest, const char *src);
int strcmp(const char *cs, const char *ct);
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the non-instrumented versions of the mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_64_H */
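
The effect of the interposition above: in a file compiled without instrumentation (no __SANITIZE_ADDRESS__ defined), every memcpy/memmove/memset call silently becomes its double-underscore twin, which performs no shadow checks. That matters because code such as the slab allocator legitimately touches memory that KASAN keeps poisoned. A hedged illustration:

    /* In a KASAN=y kernel, inside a file built with KASAN_SANITIZE := n: */
    static void copy_example(void *dst, const void *src, size_t len)
    {
            /* Expands to __memcpy(dst, src, len): no shadow check, so
             * writing into intentionally poisoned redzones is safe here. */
            memcpy(dst, src, len);
    }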
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 223b9f0d463f..766fa5cc15b4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,6 +13,33 @@
#include <asm/types.h>
/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack. We do it because of a nasty
+ * 32-bit corner case. On x86_32, the hardware stack frame is
+ * variable-length. Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0. On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault. (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
+/*
* low level task data that entry.S needs immediate access to
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..cd791948b286 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,75 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+struct tlb_state {
+#ifdef CONFIG_SMP
+ struct mm_struct *active_mm;
+ int state;
+#endif
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 | mask) != cr4) {
+ cr4 |= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 & ~mask) != cr4) {
+ cr4 &= ~mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+
+/*
+ * Save some of the CR4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPUs that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static inline void __native_flush_tlb(void)
{
native_write_cr3(native_read_cr3());
@@ -24,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;
- cr4 = native_read_cr4();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
@@ -184,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
static inline void reset_lazy_tlbstate(void)
{
this_cpu_write(cpu_tlbstate.state, 0);
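
These helpers replace the set_in_cr4()/clear_in_cr4() pair removed from processor.h above: each CPU keeps a software copy of CR4, so reads never touch the (slow) control register and hardware writes happen only when a bit actually changes. A usage sketch under the stated locking rule (illustrative caller, not from the patch):

    /* Enable user-space RDPMC on the current CPU. */
    static void example_enable_rdpmc(void)
    {
            unsigned long flags;

            local_irq_save(flags);     /* shadow and CR4 must change together */
            cr4_set_bits(X86_CR4_PCE); /* skips the CR4 write if already set */
            local_irq_restore(flags);
    }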
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>
/*
* VMX functions:
@@ -40,12 +41,12 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}
static inline int cpu_vmx_enabled(void)
{
- return read_cr4() & X86_CR4_VMXE;
+ return __read_cr4() & X86_CR4_VMXE;
}
/** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index e45e4da96bf1..f58a9c7a3c86 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -172,7 +172,6 @@ struct x86_platform_ops {
struct pci_dev;
struct msi_msg;
-struct msi_desc;
struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
@@ -183,8 +182,6 @@ struct x86_msi_ops {
void (*teardown_msi_irqs)(struct pci_dev *dev);
void (*restore_msi_irqs)(struct pci_dev *dev);
int (*setup_hpet_msi)(unsigned int irq, unsigned int id);
- u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag);
- u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag);
};
struct IO_APIC_route_entry;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d866959e5685..d2ad00a42234 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,4 +57,6 @@ static inline bool xen_x2apic_para_available(void)
}
#endif
+extern void xen_set_iopl_mask(unsigned mask);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index e21331ce368f..177889cd0505 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -69,6 +69,12 @@
#define MSR_LBR_CORE_FROM 0x00000040
#define MSR_LBR_CORE_TO 0x00000060
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYCLES 0xffff
+
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index b5d7640abc5d..8a4add8e4639 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -100,6 +100,7 @@
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
{ SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
{ SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
{ SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
{ SVM_EXIT_INTR, "interrupt" }, \
{ SVM_EXIT_NMI, "nmi" }, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f1e77440b2b..2e7f86e8a4a6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..adb3eaf8fe2a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,6 +16,7 @@
#include <asm/cacheflush.h>
#include <asm/realmode.h>
+#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
@@ -78,7 +79,7 @@ int x86_acpi_suspend_lowlevel(void)
header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = __read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
@@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
do_suspend_lowlevel();
+ unpause_graph_tracing();
return 0;
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index f04dbb3069b8..29f0c55d6efc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -69,8 +69,8 @@ int amd_cache_northbridges(void)
while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
i++;
- if (i == 0)
- return 0;
+ if (!i)
+ return -ENODEV;
nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ba6cc041edb1..f7eef03fd4b3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -366,6 +366,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
apic_write(APIC_LVTT, lvtt_value);
if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+
printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 791ffdafd674..12706a1a4a1e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3159,7 +3159,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- __write_msi_msg(data->msi_desc, &msg);
+ __pci_write_msi_msg(data->msi_desc, &msg);
return IRQ_SET_MASK_OK_NOCOPY;
}
@@ -3170,8 +3170,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
*/
static struct irq_chip msi_chip = {
.name = "PCI-MSI",
- .irq_unmask = unmask_msi_irq,
- .irq_mask = mask_msi_irq,
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
.irq_ack = ack_apic_edge,
.irq_set_affinity = msi_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
@@ -3197,7 +3197,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
* MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
*/
if (!irq_offset)
- write_msi_msg(irq, &msg);
+ pci_write_msi_msg(irq, &msg);
setup_remapped_irq(irq, irq_cfg(irq), chip);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cfa9b5b2c27a..88635b301694 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set_bits(X86_CR4_SMEP);
}
static __init int setup_disable_smap(char *arg)
@@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set_bits(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear_bits(X86_CR4_SMAP);
#endif
}
}
@@ -1304,6 +1305,12 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
@@ -1322,7 +1329,7 @@ void cpu_init(void)
pr_debug("Initializing CPU#%d\n", cpu);
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1376,7 +1383,7 @@ void cpu_init(void)
load_sp0(t, &current->thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
clear_all_debug_regs();
dbg_restore_debug_regs();
@@ -1398,12 +1405,18 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
show_ucode_info_early();
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
switch_to_new_gdt(cpu);
@@ -1419,7 +1432,7 @@ void cpu_init(void)
load_sp0(t, thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index aaad25e9f088..1492d3adfdac 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -45,6 +45,7 @@
#include <linux/work-simple.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -1509,7 +1510,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..30692ac88d1e 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
#include <linux/smp.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -59,7 +60,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..590cc753ba8f 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -31,7 +32,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index ec9df6f9cd47..5e109a31f62b 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
int i;
- while (leftover) {
+ while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
@@ -171,7 +171,7 @@ static void post_set(void)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5dc855..c832e9f54cd6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,8 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -1328,7 +1330,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_STARTING:
if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1834,9 +1836,9 @@ static void change_rdpmc(void *info)
bool enable = !!(unsigned long)info;
if (enable)
- set_in_cr4(X86_CR4_PCE);
+ cr4_set_bits(X86_CR4_PCE);
else
- clear_in_cr4(X86_CR4_PCE);
+ cr4_clear_bits(X86_CR4_PCE);
}
static ssize_t set_attr_rdpmc(struct device *cdev,
@@ -1986,21 +1988,25 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ struct ldt_struct *ldt;
+
if (idx > LDT_ENTRIES)
return 0;
- if (idx > current->active_mm->context.size)
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = lockless_dereference(current->active_mm->context.ldt);
+ if (!ldt || idx > ldt->size)
return 0;
- desc = current->active_mm->context.ldt;
+ desc = &ldt->entries[idx];
} else {
if (idx > GDT_ENTRIES)
return 0;
- desc = raw_cpu_ptr(gdt_page.gdt);
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
}
- return get_desc_base(desc + idx);
+ return get_desc_base(desc);
}
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..df11583a9041 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2604,13 +2604,13 @@ __init int intel_pmu_init(void)
* counter, so do not extend mask to generic counters
*/
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != FIXED_EVENT_FLAGS
- || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- continue;
+ if (c->cmask == FIXED_EVENT_FLAGS
+ && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
-
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
+ c->idxmsk64 &=
+ ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+ c->weight = hweight64(c->idxmsk64);
}
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f5ab56d14287..3af40315a127 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -183,10 +183,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
}
#ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(unsigned long start_pfn,
- unsigned long nr_pfn, void *arg)
+static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
{
- int *nr_ranges = arg;
+ unsigned int *nr_ranges = arg;
(*nr_ranges)++;
return 0;
@@ -212,7 +211,7 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
ced->image = image;
- walk_system_ram_range(0, -1, &nr_ranges,
+ walk_system_ram_res(0, -1, &nr_ranges,
get_nr_ram_ranges_callback);
ced->max_nr_ranges = nr_ranges;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7c4402..cf3df1d8d039 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err)
printk("SMP ");
#endif
#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
+ printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+ printk("KASAN");
#endif
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 2e1a6853e00c..2fa494f59828 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -11,7 +11,11 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -21,6 +25,9 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -565,6 +579,61 @@ static void __init force_disable_hpet(int num, int slot, int func)
#endif
}
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ dev_err("Cannot power up Apple AirPort card\n");
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ dev_err("Cannot iomap Apple AirPort card\n");
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
@@ -578,12 +647,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -609,9 +672,13 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -620,7 +687,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -629,6 +696,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -656,25 +724,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
+
+ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
if (!(type & 0x80))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 5af2338b30c4..4c1a1b5bfaab 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -414,7 +414,7 @@ sysenter_past_esp:
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
@@ -998,6 +998,9 @@ ENTRY(xen_hypervisor_callback)
ENTRY(xen_do_upcall)
1: mov %esp, %eax
call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index abaa7cb9c188..f4cbb1272e27 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1190,6 +1190,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
popq %rsp
CFI_DEF_CFA_REGISTER rsp
decl PER_CPU_VAR(irq_count)
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
jmp error_exit
CFI_ENDPROC
END(xen_do_hypervisor_callback)
@@ -1442,7 +1445,18 @@ END(error_exit)
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
+ /*
+ * Fix up the exception frame if we're on Xen.
+ * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
+ * one value to the stack on native, so it may clobber the rdx
+ * scratch slot, but it won't clobber any of the important
+ * slots past it.
+ *
+ * Xen is a different story, because the Xen frame itself overlaps
+ * the "NMI executing" variable.
+ */
PARAVIRT_ADJUST_EXCEPTION_FRAME
+
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
@@ -1460,11 +1474,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1473,52 +1488,174 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as our temp variable throughout */
pushq_cfi %rdx
CFI_REL_OFFSET rdx, 0
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ *
+ * We also must not push anything to the stack before switching
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+ SWAPGS_UNSAFE_STACK
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(kernel_stack), %rsp
+ addq $KERNEL_STACK_OFFSET, %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+
+ addq $6*8, %rsp /* skip bx, bp, and r12-r15 */
+ jmp restore_args
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+	 * about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
+
CFI_REMEMBER_STATE
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 1*8
@@ -1537,60 +1674,23 @@ nested_nmi_out:
popq_cfi %rdx
CFI_RESTORE rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
CFI_RESTORE_STATE
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
CFI_RESTORE rdx
- /* Set the NMI executing variable on the stack. */
+ /* Set "NMI executing" on the stack. */
pushq_cfi $1
- /*
- * Leave room for the "copied" frame
- */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
CFI_ADJUST_CFA_OFFSET 5*8
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq_cfi 11*8(%rsp)
.endr
@@ -1598,6 +1698,7 @@ first_nmi:
/* Everything up to here is safe from nested NMIs */
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1606,16 +1707,21 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi.
+ *
+ * Set "NMI executing" in case we came back here via IRET.
*/
movq $1, 10*8(%rsp)
- /* Make another copy, this one may be modified by nested NMIs */
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
addq $(10*8), %rsp
CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
@@ -1626,9 +1732,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
@@ -1643,29 +1749,11 @@ end_repeat_nmi:
call save_paranoid
DEFAULT_FRAME 0
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
-
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
@@ -1674,9 +1762,23 @@ nmi_restore:
/* Pop the extra iret frame at once */
RESTORE_ALL 6*8
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
- jmp irq_return
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
+ INTERRUPT_RETURN
CFI_ENDPROC
END(nmi)
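The replay scheme above can be modeled in a few lines of C. The following is a minimal user-space sketch, not kernel code: the "outermost" frame holds the real return target, the "iret" frame is what IRET actually consumes, and a nested NMI redirects the "iret" frame to repeat_nmi so the outer handler runs one more iteration. All names are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct frame { unsigned long rip; };		/* stand-in for the 5-word iret frame */

static const unsigned long REPEAT_NMI = 0x2000;
static struct frame outermost = { 0x1000 };	/* final return target, written once */
static struct frame iret_frame;			/* the copy a nested NMI may rewrite */
static bool nested_nmi_pending;

static void nmi_iteration(void)
{
	iret_frame = outermost;			/* repeat_nmi: copy outermost -> iret */
	/* ... do_nmi() runs here; a nested NMI may arrive ... */
	if (nested_nmi_pending) {
		iret_frame.rip = REPEAT_NMI;	/* nested NMI forces another pass */
		nested_nmi_pending = false;
	}
}

int main(void)
{
	nested_nmi_pending = true;		/* simulate one nested NMI */
	do {
		nmi_iteration();
	} while (iret_frame.rip == REPEAT_NMI);	/* IRET either returns or loops */
	printf("IRET to %#lx\n", iret_frame.rip);
	return 0;
}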
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);
/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..b111ab5c4509 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
#include <asm/bios_ebda.h>
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
+#include <asm/kasan.h>
/*
* Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
next_early_pgt = 0;
- write_cr3(__pa(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_level4_pgt));
}
/* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
return -1;
again:
@@ -155,14 +156,18 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();
+ kasan_map_early_shadow(early_level4_pgt);
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handlers[i]);
+ set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
copy_bootdata(__va(real_mode_data));
@@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
+ kasan_map_early_shadow(init_level4_pgt);
+
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..e7be5290fe1f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -61,9 +61,16 @@
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
-/* Number of possible pages in the lowmem region */
-LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
-
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 left by 31 instead of 1 left by 32 in order to avoid a
+ * gas warning about an overflowing shift count when gas has been built
+ * with support for only a host target and uses a 32-bit type for its
+ * internal representation.
+ */
+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
/* Enough space to fit pagetables for the low memory linear map */
MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
@@ -477,21 +484,22 @@ is486:
__INIT
setup_once:
/*
- * Set up a idt with 256 entries pointing to ignore_int,
- * interrupt gates. It doesn't actually load idt - that needs
- * to be done on each CPU. Interrupts are enabled elsewhere,
- * when we can be relatively sure everything is ok.
+	 * Set up an idt with 256 interrupt gates that push zero if there
+ * is no error code and then jump to early_idt_handler_common.
+ * It doesn't actually load the idt - that needs to be done on
+ * each CPU. Interrupts are enabled elsewhere, when we can be
+ * relatively sure everything is ok.
*/
movl $idt_table,%edi
- movl $early_idt_handlers,%eax
+ movl $early_idt_handler_array,%eax
movl $NUM_EXCEPTION_VECTORS,%ecx
1:
movl %eax,(%edi)
movl %eax,4(%edi)
/* interrupt gate, dpl=0, present */
movl $(0x8E000000 + __KERNEL_CS),2(%edi)
- addl $9,%eax
+ addl $EARLY_IDT_HANDLER_SIZE,%eax
addl $8,%edi
loop 1b
@@ -523,26 +531,28 @@ setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
ret
-ENTRY(early_idt_handlers)
+ENTRY(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
# 28(%esp) %eip
	# 24(%esp)	error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handlers)
+ENDPROC(early_idt_handler_array)
- /* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%esp) # X86_TRAP_NMI
@@ -602,7 +612,7 @@ ex_entry:
is_nmi:
addl $8,%esp /* drop vector number and error code */
iret
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..f8a8406033c3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -321,26 +321,28 @@ bad_address:
jmp bad_address
__INIT
- .globl early_idt_handlers
-early_idt_handlers:
+ENTRY(early_idt_handler_array)
# 104(%rsp) %rflags
# 96(%rsp) %cs
# 88(%rsp) %rip
# 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushq $0 # Dummy error code, to make stack frame uniform
.endif
pushq $i # 72(%rsp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
+ENDPROC(early_idt_handler_array)
-/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%rsp) # X86_TRAP_NMI
@@ -412,7 +414,7 @@ ENTRY(early_idt_handler)
is_nmi:
addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
__INITDATA
@@ -514,8 +516,38 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+#ifdef CONFIG_KASAN
+#define FILL(VAL, COUNT) \
+ .rept (COUNT) ; \
+ .quad (VAL) ; \
+ .endr
+
+NEXT_PAGE(kasan_zero_pte)
+ FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pmd)
+ FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pud)
+ FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
+
+#undef FILL
+#endif
+
+
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
+
+#ifdef CONFIG_KASAN
+/*
+ * This page is used as the early shadow. We don't use empty_zero_page
+ * because at early stages stack instrumentation could write some
+ * garbage to this page.
+ * Later we reuse it as the zero shadow for large ranges of memory
+ * that are allowed to be accessed, but are not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+NEXT_PAGE(kasan_zero_page)
+ .skip PAGE_SIZE
+#endif
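For context on why one zeroed page can shadow such large regions: KASAN maps every 8 bytes of kernel address space to 1 shadow byte, so a single zero-filled shadow page, wired up through the kasan_zero_{pte,pmd,pud} tables above, marks huge ranges as accessible. A sketch of the usual address translation, where KASAN_SHADOW_OFFSET is a configuration-dependent constant rather than anything defined in this hunk:

static inline void *kasan_mem_to_shadow(const void *addr)
{
	/* 8 bytes of real memory per shadow byte (scale shift of 3) */
	return (void *)(((unsigned long)addr >> 3) + KASAN_SHADOW_OFFSET);
}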
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index ae0fdc86b7b6..8d6e954db2a7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
@@ -195,7 +196,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set_bits(cr4_mask);
cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..792621a32457 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
struct pt_regs *regs = current_pt_regs();
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
@@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
set_iopl_mask(t->iopl);
return 0;
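The shifts above work because the IOPL field occupies bits 12-13 of EFLAGS (X86_EFLAGS_IOPL_BIT == 12). A minimal sketch of the encode/decode helpers implied here, with illustrative names:

#define IOPL_BIT	12
#define IOPL_MASK	(3UL << IOPL_BIT)

static unsigned int iopl_from_flags(unsigned long flags)
{
	return (flags & IOPL_MASK) >> IOPL_BIT;
}

static unsigned long flags_with_iopl(unsigned long flags, unsigned int level)
{
	return (flags & ~IOPL_MASK) | ((unsigned long)level << IOPL_BIT);
}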
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 93d2c04c6f8f..a78aa118afc2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -330,13 +330,16 @@ int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int length;
kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
insn_get_length(&insn);
+ length = insn.length;
+
	/* Another subsystem has installed a breakpoint here and we failed to recover it */
if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
- memcpy(dest, insn.kaddr, insn.length);
+ memcpy(dest, insn.kaddr, length);
#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
@@ -366,7 +369,7 @@ int __copy_instruction(u8 *dest, u8 *src)
*(s32 *) disp = (s32) newdisp;
}
#endif
- return insn.length;
+ return length;
}
static int arch_copy_kprobe(struct kprobe *p)
@@ -928,7 +931,19 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* normal page fault.
*/
regs->ip = (unsigned long)cur->addr;
+ /*
+ * Trap flag (TF) has been set here because this fault
+ * happened where the single stepping will be done.
+ * So clear it by resetting the current kprobe:
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ /*
+ * If the TF flag was set before the kprobe hit,
+ * don't touch it:
+ */
regs->flags |= kcb->kprobe_old_flags;
+
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
else
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c37886d759cc..2bcc0525f1c1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -20,82 +21,82 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
-#ifdef CONFIG_SMP
+/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *current_mm)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ mm_context_t *pc;
+
+ if (current->active_mm != current_mm)
+ return;
+
+ pc = &current->active_mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->size);
}
-#endif
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(int size)
{
- void *oldldt, *newldt;
- int oldsize;
-
- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+ struct ldt_struct *new_ldt;
+ int alloc_size;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = size * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = vzalloc(alloc_size);
else
- newldt = (void *)__get_free_page(GFP_KERNEL);
-
- if (!newldt)
- return -ENOMEM;
+ new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
+ }
- paravirt_alloc_ldt(newldt, mincount);
+ new_ldt->size = size;
+ return new_ldt;
+}
-#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
-
- if (reload) {
-#ifdef CONFIG_SMP
- preempt_disable();
- load_LDT(pc);
- if (!cpumask_equal(mm_cpumask(current->mm),
- cpumask_of(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#else
- load_LDT(pc);
-#endif
- }
- if (oldsize) {
- paravirt_free_ldt(oldldt, oldsize);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
- }
- return 0;
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->size);
}
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+/* context.lock is held */
+static void install_ldt(struct mm_struct *current_mm,
+ struct ldt_struct *ldt)
{
- int err = alloc_ldt(new, old->size, 0);
- int i;
+ /* Synchronizes with lockless_dereference in load_mm_ldt. */
+ smp_store_release(&current_mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current_mm. */
+ on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+}
- if (err < 0)
- return err;
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
- for (i = 0; i < old->size; i++)
- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
- return 0;
+ paravirt_free_ldt(ldt->entries, ldt->size);
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ kfree(ldt->entries);
+ kfree(ldt);
}
/*
@@ -104,17 +105,37 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
+ struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0;
mutex_init(&mm->context.lock);
- mm->context.size = 0;
old_mm = current->mm;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm) {
+ mm->context.ldt = NULL;
+ return 0;
}
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt) {
+ mm->context.ldt = NULL;
+ goto out_unlock;
+ }
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->size * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}
@@ -125,53 +146,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*/
void destroy_context(struct mm_struct *mm)
{
- if (mm->context.size) {
-#ifdef CONFIG_X86_32
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
-#endif
- paravirt_free_ldt(mm->context.ldt, mm->context.size);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
+ int retval;
unsigned long size;
struct mm_struct *mm = current->mm;
- if (!mm->context.size)
- return 0;
+ mutex_lock(&mm->context.lock);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
+
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
+ size = mm->context.ldt->size * LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;
- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
if (size != bytecount) {
- /* zero-fill the rest */
- if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + size, bytecount - size)) {
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ mutex_unlock(&mm->context.lock);
+ return retval;
}
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -195,6 +210,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
struct desc_struct ldt;
int error;
struct user_desc ldt_info;
+ int oldsize, newsize;
+ struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -213,34 +230,39 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
goto out;
}
- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(&current->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}
- if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
- error = -EINVAL;
+ mutex_lock(&mm->context.lock);
+
+ old_ldt = mm->context.ldt;
+ oldsize = old_ldt ? old_ldt->size : 0;
+ newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(newsize);
+ if (!new_ldt)
goto out_unlock;
- }
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
- /* Install the new entry ... */
-install:
- write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ install_ldt(mm, new_ldt);
+ free_ldt_struct(old_ldt);
error = 0;
out_unlock:
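The write_ldt() rework above follows a copy-then-publish pattern: build a complete replacement table, publish it with a release store, and only then free the old one. A condensed user-space sketch of that pattern (names are illustrative; the real code additionally IPIs every CPU in mm_cpumask via install_ldt/flush_ldt):

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct table {
	size_t size;
	unsigned long *entries;
};

static _Atomic(struct table *) cur;

static int set_entry(size_t idx, unsigned long val)
{
	struct table *old = atomic_load_explicit(&cur, memory_order_acquire);
	size_t oldsize = old ? old->size : 0;
	size_t newsize = (idx + 1 > oldsize) ? idx + 1 : oldsize;
	struct table *new = malloc(sizeof(*new));

	if (!new)
		return -1;
	new->entries = calloc(newsize, sizeof(*new->entries));
	if (!new->entries) {
		free(new);
		return -1;
	}
	if (old)
		memcpy(new->entries, old->entries, oldsize * sizeof(*new->entries));
	new->entries[idx] = val;
	new->size = newsize;

	/* Publish: readers using an acquire load see fully written entries. */
	atomic_store_explicit(&cur, new, memory_order_release);

	/* The kernel can free 'old' right away because context.lock excludes
	 * other writers and flush_ldt() moved every CPU off the old table. */
	if (old) {
		free(old->entries);
		free(old);
	}
	return 0;
}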
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..d1ac80b72c72 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -83,13 +84,22 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ void *p;
+
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- return __vmalloc_node_range(size, 1,
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+ PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
__builtin_return_address(0));
+ if (p && (kasan_module_alloc(p, size) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..a701b49e8c87 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints, which will cause them to lose
+ * their NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
*
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);
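The three-state latch that do_nmi() now implements inline can be modeled in a few lines of C; NMI_EXECUTING is 1 and NMI_LATCHED is 2 precisely so the final decrement either reaches 0 (fully done) or lands back on 1 (run the latched NMI). A minimal sketch, with a plain static standing in for the per-CPU variable:

enum { NMI_NOT_RUNNING = 0, NMI_EXECUTING = 1, NMI_LATCHED = 2 };
static int nmi_state;

static void model_do_nmi(void)
{
	if (nmi_state != NMI_NOT_RUNNING) {
		nmi_state = NMI_LATCHED;	/* remember at most one extra NMI */
		return;
	}
	nmi_state = NMI_EXECUTING;
restart:
	/* ... handle the NMI ... */
	if (--nmi_state)	/* LATCHED(2) drops to EXECUTING(1): run again */
		goto restart;
	/* EXECUTING(1) dropped to NOT_RUNNING(0): fully done */
}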
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..8d12f0546dfc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,10 +41,18 @@
#include <asm/timer.h>
#include <asm/special_insns.h>
-/* nop stub */
-void _paravirt_nop(void)
-{
-}
+/*
+ * nop stub, which must not clobber anything *including the stack* to
+ * avoid confusing the entry prologues.
+ */
+extern void _paravirt_nop(void);
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global _paravirt_nop\n"
+ "_paravirt_nop:\n\t"
+ "ret\n\t"
+ ".size _paravirt_nop, . - _paravirt_nop\n\t"
+ ".type _paravirt_nop, @function\n\t"
+ ".popsection");
/* identity function, which can be inlined */
u32 _paravirt_ident_32(u32 x)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6ad8a6396b75..a388bb883128 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -142,7 +143,7 @@ void flush_thread(void)
static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set_bits(X86_CR4_TSD);
}
void disable_TSC(void)
@@ -159,7 +160,7 @@ void disable_TSC(void)
static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear_bits(X86_CR4_TSD);
}
static void enable_TSC(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 3531e8fdfdc1..ae3ac253f181 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -102,7 +102,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4_safe();
+ cr4 = __read_cr4_safe();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..f547f866e86c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,6 +49,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
asmlinkage extern void ret_from_fork(void);
@@ -93,7 +94,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
@@ -122,11 +123,11 @@ void __show_regs(struct pt_regs *regs, int all)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
- if (dead_task->mm->context.size) {
+ if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
- dead_task->mm->context.size);
+ dead_task->mm->context.ldt->size);
BUG();
}
}
@@ -424,6 +425,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#ifdef CONFIG_XEN
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(xen_pv_domain() &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
return prev_p;
}
@@ -476,27 +488,59 @@ void set_personality_ia32(bool x32)
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
unsigned long get_wchan(struct task_struct *p)
{
- unsigned long stack;
- u64 fp, ip;
+ unsigned long start, bottom, top, sp, fp, ip;
int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
- stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
+
+ start = (unsigned long)task_stack_page(p);
+ if (!start)
+ return 0;
+
+ /*
+ * Layout of the stack page:
+ *
+ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+ * PADDING
+ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+ * stack
+ * ----------- bottom = start + sizeof(thread_info)
+ * thread_info
+ * ----------- start
+ *
+	 * The task's stack pointer points at the location where the
+	 * frame pointer is stored. The data on the stack is:
+ * ... IP FP ... IP FP
+ *
+ * We need to read FP and IP, so we need to adjust the upper
+ * bound by another unsigned long.
+ */
+ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+ top -= 2 * sizeof(unsigned long);
+ bottom = start + sizeof(struct thread_info);
+
+ sp = READ_ONCE(p->thread.sp);
+ if (sp < bottom || sp > top)
return 0;
- fp = *(u64 *)(p->thread.sp);
+
+ fp = READ_ONCE(*(unsigned long *)sp);
do {
- if (fp < (unsigned long)stack ||
- fp >= (unsigned long)stack+THREAD_SIZE)
+ if (fp < bottom || fp > top)
return 0;
- ip = *(u64 *)(fp+8);
+ ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long)));
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = READ_ONCE(*(unsigned long *)fp);
+ } while (count++ < 16 && p->state != TASK_RUNNING);
return 0;
}
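Extracted from the function above, the core of the walk is a bounded chase through the saved "... IP FP" pairs. A standalone sketch (illustrative; in_sched stands in for in_sched_functions, and the bounds play the roles of 'bottom' and 'top'):

static unsigned long walk_frames(unsigned long bottom, unsigned long top,
				 unsigned long fp,
				 int (*in_sched)(unsigned long ip))
{
	int count = 0;

	do {
		unsigned long ip;

		if (fp < bottom || fp > top)
			return 0;	/* frame left the stack: bail out */
		ip = *(unsigned long *)(fp + sizeof(unsigned long));
		if (!in_sched(ip))
			return ip;	/* first caller outside the scheduler */
		fp = *(unsigned long *)fp;	/* follow the saved frame pointer */
	} while (count++ < 16);

	return 0;
}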
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab08aa2276fb..28a850625672 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,6 +89,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/bugs.h>
+#include <asm/kasan.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
@@ -1174,9 +1175,11 @@ void __init setup_arch(char **cmdline_p)
x86_init.paging.pagetable_init();
+ kasan_init();
+
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = read_cr4();
+ mmu_cr4_features = __read_cr4();
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 9b4d51d0c0d0..0ccb53a9fcd9 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <asm/desc.h>
+#include <asm/mmu_context.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
@@ -27,13 +28,14 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
struct desc_struct *desc;
unsigned long base;
- seg &= ~7UL;
+ seg >>= 3;
mutex_lock(&child->mm->context.lock);
- if (unlikely((seg >> 3) >= child->mm->context.size))
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->size))
addr = -1L; /* bogus selector, access would fault */
else {
- desc = child->mm->context.ldt + seg;
+ desc = &child->mm->context.ldt->entries[seg];
base = get_desc_base(desc);
/* 16-bit code segment? */
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index b285d4e8c68e..5da924bbf0a0 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -106,14 +106,24 @@ static int __init efifb_set_system(const struct dmi_system_id *id)
continue;
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
resource_size_t start, end;
+ unsigned long flags;
+
+ flags = pci_resource_flags(dev, i);
+ if (!(flags & IORESOURCE_MEM))
+ continue;
+
+ if (flags & IORESOURCE_UNSET)
+ continue;
+
+ if (pci_resource_len(dev, i) == 0)
+ continue;
start = pci_resource_start(dev, i);
- if (start == 0)
- break;
end = pci_resource_end(dev, i);
if (screen_info.lfb_base >= start &&
screen_info.lfb_base < end) {
found_bar = 1;
+ break;
}
}
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..21187ebee7d0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -21,6 +21,7 @@
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/geode.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -1004,15 +1005,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static void __init check_system_tsc_reliable(void)
{
-#ifdef CONFIG_MGEODE_LX
- /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
- unsigned long res_low, res_high;
+ unsigned long res_low, res_high;
- rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a very reliable TSC */
- if (res_low & RTSC_SUSP)
- tsc_clocksource_reliable = 1;
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 92ae6acac8a7..6aa0f4d9eea6 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -92,7 +92,7 @@ unsigned long try_msr_calibrate_tsc(void)
if (freq_desc_tables[cpu_index].msr_plat) {
rdmsr(MSR_PLATFORM_INFO, lo, hi);
- ratio = (lo >> 8) & 0x1f;
+ ratio = (lo >> 8) & 0xff;
} else {
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
ratio = (hi >> 8) & 0x1f;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 040681928e9d..37d8fa4438f0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
#undef memset
#undef memmove
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
extern void *memset(void *, int, __kernel_size_t);
extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
#ifndef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e48b674639cc..234b0722de53 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
.setup_hpet_msi = default_setup_hpet_msi,
- .msi_mask_irq = default_msi_mask_irq,
- .msix_mask_irq = default_msix_mask_irq,
};
/* MSI arch specific hooks */
@@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
{
x86_msi.restore_msi_irqs(dev);
}
-u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
-{
- return x86_msi.msi_mask_irq(desc, mask, flag);
-}
-u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
-{
- return x86_msi.msix_mask_irq(desc, flag);
-}
#endif
struct x86_io_apic_ops x86_io_apic_ops = {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 8be1e1711203..cdc6cf903078 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>
/*
@@ -454,7 +455,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 974e4d98ed29..852572c971c4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -658,7 +658,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
*max_size = 0;
switch (ctxt->mode) {
case X86EMUL_MODE_PROT64:
- if (((signed long)la << 16) >> 16 != la)
+ if (is_noncanonical_address(la))
return emulate_gp(ctxt, 0);
*max_size = min_t(u64, ~0u, (1ull << 48) - la);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 298781d4cfb4..b0a706d063cb 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -244,7 +244,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
* PIC is being reset. Handle it gracefully here
*/
atomic_inc(&ps->pending);
- else if (value > 0)
+ else if (value > 0 && ps->reinject)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
*/
@@ -287,7 +287,9 @@ static void pit_do_work(struct kthread_work *work)
* last one has been acked.
*/
spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
+ if (!ps->reinject)
+ inject = 1;
+ else if (ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
@@ -305,7 +307,7 @@ static void pit_do_work(struct kthread_work *work)
* LVT0 to NMI delivery. Other PIC interrupts are just sent to
* VCPU0, and only if its LVT0 is in EXTINT mode.
*/
- if (kvm->arch.vapics_in_nmi_mode > 0)
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_apic_nmi_wd_deliver(vcpu);
}
@@ -316,10 +318,10 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ps->reinject || !atomic_read(&ps->pending)) {
+ if (ps->reinject)
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
- }
+
+ queue_kthread_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ca7b11e8e2d3..33caf685e978 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1146,10 +1146,10 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
if (!nmi_wd_enabled) {
apic_debug("Receive NMI setting on APIC_LVT0 "
"for cpu %d\n", apic->vcpu->vcpu_id);
- apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
}
} else if (nmi_wd_enabled)
- apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
}
static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
@@ -1722,6 +1722,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
+ apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6a11845fd8b9..72051730caf1 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -165,7 +165,7 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.apic->pending_events;
+ return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f696dedb0fa7..23875c26fb34 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -372,12 +372,6 @@ static u64 __get_spte_lockless(u64 *sptep)
{
return ACCESS_ONCE(*sptep);
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- /* It is valid if the spte is zapped. */
- return spte == 0ull;
-}
#else
union split_spte {
struct {
@@ -493,23 +487,6 @@ retry:
return spte.spte;
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- union split_spte sspte = (union split_spte)spte;
- u32 high_mmio_mask = shadow_mmio_mask >> 32;
-
- /* It is valid if the spte is zapped. */
- if (spte == 0ull)
- return true;
-
- /* It is valid if the spte is being zapped. */
- if (sspte.spte_low == 0ull &&
- (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
- return true;
-
- return false;
-}
#endif
static bool spte_is_locklessly_modifiable(u64 spte)
@@ -3230,21 +3207,6 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return vcpu_match_mmio_gva(vcpu, addr);
}
-
-/*
- * On direct hosts, the last spte is only allows two states
- * for mmio page fault:
- * - It is the mmio spte
- * - It is zapped or it is being zapped.
- *
- * This function completely checks the spte when the last spte
- * is not the mmio spte.
- */
-static bool check_direct_spte_mmio_pf(u64 spte)
-{
- return __check_direct_spte_mmio_pf(spte);
-}
-
static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
{
struct kvm_shadow_walk_iterator iterator;
@@ -3287,13 +3249,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
/*
- * It's ok if the gva is remapped by other cpus on shadow guest,
- * it's a BUG if the gfn is not a mmio page.
- */
- if (direct && !check_direct_spte_mmio_pf(spte))
- return RET_MMIO_PF_BUG;
-
- /*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6e6d115fe9b5..d537c9badeb6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -257,7 +257,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
return ret;
mark_page_dirty(vcpu->kvm, table_gfn);
- walker->ptes[level] = pte;
+ walker->ptes[level - 1] = pte;
}
return 0;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f7eaee1cbc54..9dc0aa0dae96 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -511,8 +511,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->vmcb->control.next_rip != 0)
+ if (svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
+ }
if (!svm->next_rip) {
if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
@@ -1101,6 +1103,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1581,7 +1584,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (cr4 & X86_CR4_VMXE)
@@ -1787,6 +1790,12 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3348,6 +3357,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -4306,7 +4316,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
break;
}
- vmcb->control.next_rip = info->next_rip;
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
vmcb->control.exit_code = icpt_info.exit_code;
vmexit = nested_svm_exit_handled(svm);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 54bda28e6a12..d9c11f3f5b18 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1493,7 +1493,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1600,6 +1600,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
return;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+		 * provide that period, so a CPU could write the host's record into
+		 * the guest's memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
for (i = 0; i < m->nr; ++i)
@@ -2744,7 +2751,7 @@ static int hardware_enable(void)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;
- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -2771,7 +2778,7 @@ static int hardware_enable(void)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set_bits(X86_CR4_VMXE);
if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2808,7 +2815,7 @@ static void hardware_disable(void)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -3553,8 +3560,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ unsigned long hw_cr4 =
+ (cr4_read_shadow() & X86_CR4_MCE) |
+ (cr4 & ~X86_CR4_MCE) |
+ (to_vmx(vcpu)->rmode.vm86_active ?
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
if (cr4 & X86_CR4_VMXE) {
/*
@@ -4280,7 +4295,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
/* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
vmx->host_state.vmcs_host_cr4 = cr4;
@@ -4925,6 +4940,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -6722,6 +6740,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7563,7 +7582,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->host_state.vmcs_host_cr4 = cr4;
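
All of the cr4_read_shadow()/cr4_set_bits() conversions in this file follow one pattern: CR4 is read from a cached per-CPU copy rather than from the register, and every writer updates the cache and the register together. A self-contained userspace model of that pattern; the *_sim names are illustrative:

#include <stdio.h>

static unsigned long hw_cr4;        /* stands in for the real CR4 */
static unsigned long cr4_shadow;    /* per-CPU cached copy */

static unsigned long cr4_read_shadow_sim(void) { return cr4_shadow; }

static void cr4_write_sim(unsigned long val)
{
    cr4_shadow = val;   /* keep the cache coherent with the register */
    hw_cr4 = val;       /* a real kernel would execute mov %reg, %cr4 */
}

static void cr4_set_bits_sim(unsigned long mask)
{
    unsigned long cr4 = cr4_read_shadow_sim();

    if ((cr4 | mask) != cr4)        /* skip the expensive write if a no-op */
        cr4_write_sim(cr4 | mask);
}

int main(void)
{
    cr4_write_sim(0x20);            /* start with PAE only, VMXE clear */
    cr4_set_bits_sim(1UL << 13);    /* set VMXE through the shadow */
    printf("cr4=%#lx shadow=%#lx\n", hw_cr4, cr4_read_shadow_sim());
    return 0;
}
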
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ebccb44c0af4..bae435bfb1ff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -656,7 +656,6 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
return 1;
- kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
@@ -2065,6 +2064,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ accumulate_steal_time(vcpu);
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -2198,12 +2199,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
- preempt_disable();
- accumulate_steal_time(vcpu);
- preempt_enable();
-
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@@ -2903,7 +2898,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
- accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
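
The steal-time rework above folds accumulation into the recording path instead of doing it at every vcpu load and at MSR writes. A tiny runnable model of the resulting flow, with run_delay standing in for current->sched_info.run_delay:

#include <stdio.h>
#include <stdint.h>

struct st { uint64_t steal, last_steal; };

static uint64_t run_delay;          /* simulated scheduler run delay */

static void accumulate_steal_time(struct st *s)
{
    s->steal += run_delay - s->last_steal;
    s->last_steal = run_delay;
}

static void record_steal_time(struct st *s)
{
    accumulate_steal_time(s);       /* accumulation folded into recording */
    printf("publishing steal=%llu to the guest\n",
           (unsigned long long)s->steal);
}

int main(void)
{
    struct st s = {0};
    run_delay = 100; record_steal_time(&s);
    run_delay = 250; record_steal_time(&s);
    return 0;
}
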
@@ -3117,6 +3111,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
+ if (dbgregs->dr6 & ~0xffffffffull)
+ return -EINVAL;
+ if (dbgregs->dr7 & ~0xffffffffull)
+ return -EINVAL;
+
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = dbgregs->dr6;
kvm_update_dr6(vcpu);
@@ -3673,13 +3672,13 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
@@ -3698,6 +3697,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int r = 0, start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3707,7 +3707,8 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, start);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
}
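
The PIT fix above reflects that the i8254 has three independent counters, so a state load must reprogram each of them, not just channel 0. Reduced to a sketch, with pit_load_count() as a stub:

#include <stdio.h>

struct pit_channel { int count; };
struct pit_state   { struct pit_channel channels[3]; };

static void pit_load_count(int channel, int count)
{
    printf("reload channel %d with count %d\n", channel, count);
}

int main(void)
{
    struct pit_state ps = { .channels = { {1193}, {0}, {65535} } };

    for (int i = 0; i < 3; i++)     /* one reload per hardware counter */
        pit_load_count(i, ps.channels[i].count);
    return 0;
}
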
@@ -6066,12 +6067,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending) {
- if (kvm_x86_ops->nmi_allowed(vcpu)) {
- --vcpu->arch.nmi_pending;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- }
+ if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
/*
* Because interrupts can be injected asynchronously, we are
@@ -6241,10 +6240,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
- else if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
+ else {
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+ }
if (kvm_lapic_enabled(vcpu)) {
/*
@@ -6269,8 +6270,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
- kvm_load_guest_xcr0(vcpu);
-
vcpu->mode = IN_GUEST_MODE;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6293,6 +6292,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
+ kvm_load_guest_xcr0(vcpu);
+
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
@@ -6341,6 +6342,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_put_guest_xcr0(vcpu);
+
/* Interrupt is enabled by handle_external_intr() */
kvm_x86_ops->handle_external_intr(vcpu);
@@ -6982,7 +6985,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
* and assume host would use all available bits.
* Guest xcr0 would be loaded later.
*/
- kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
fpu_restore_checking(&vcpu->arch.guest_fpu);
@@ -6991,8 +6993,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_xcr0(vcpu);
-
if (!vcpu->guest_fpu_loaded)
return;
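
Taken together, the xcr0 hunks above shrink the window in which the guest's XCR0 is live: it is installed just before entering the guest and the host value is restored right after the exit, so no host-side code runs under guest XCR0. A userspace model of that ordering; load_xcr0() simulates the xsetbv write:

#include <stdio.h>
#include <stdint.h>

static uint64_t cur_xcr0 = 0x7;     /* host value, e.g. x87|SSE|AVX */

static void load_xcr0(uint64_t v)  { cur_xcr0 = v; }

static void run_guest(uint64_t guest_xcr0, uint64_t host_xcr0)
{
    load_xcr0(guest_xcr0);          /* just before entering the guest */
    printf("in guest, xcr0=%#llx\n", (unsigned long long)cur_xcr0);
    load_xcr0(host_xcr0);           /* immediately after the exit */
    printf("back in host, xcr0=%#llx\n", (unsigned long long)cur_xcr0);
}

int main(void) { run_guest(0x1, 0x7); return 0; }
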
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 56313a326188..89b53c9968e7 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -53,6 +53,8 @@
.Lmemcpy_e_e:
.previous
+.weak memcpy
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -199,8 +201,8 @@ ENDPROC(__memcpy)
* only outcome...
*/
.section .altinstructions, "a"
- altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
- altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 65268a6104f4..9c4b530575da 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,7 +24,10 @@
* Output:
* rax: dest
*/
+.weak memmove
+
ENTRY(memmove)
+ENTRY(__memmove)
CFI_STARTPROC
/* Handle 32 bytes or more per loop iteration */
@@ -220,4 +223,5 @@ ENTRY(memmove)
.Lmemmove_end_forward-.Lmemmove_begin_forward, \
.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
+ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2dcb3808cbda..6f44935c6a60 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -56,6 +56,8 @@
.Lmemset_e_e:
.previous
+.weak memset
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -147,8 +149,8 @@ ENDPROC(__memset)
* feature to implement the right patch order.
*/
.section .altinstructions,"a"
- altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
- .Lfinal-memset,.Lmemset_e-.Lmemset_c
- altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
- .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
+ altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
.previous
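
The .weak markers and the retargeted altinstruction entries above serve one goal: let an instrumented wrapper (KASAN's, in this series) override the public name while the double-underscore alias keeps pointing at the plain implementation. The same trick can be shown in userspace C with GCC attributes; my_memcpy/__my_memcpy are illustrative names:

#include <stdio.h>
#include <string.h>

void *__my_memcpy(void *dst, const void *src, size_t n)
{
    return memcpy(dst, src, n);     /* the "real" implementation */
}

/* Weak default: any strong definition elsewhere silently wins. */
__attribute__((weak, alias("__my_memcpy")))
void *my_memcpy(void *dst, const void *src, size_t n);

int main(void)
{
    char buf[8];

    my_memcpy(buf, "weak!", 6);
    printf("%s\n", buf);
    return 0;
}
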
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 6a19ad9f370d..9648838b78fa 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
+
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2bb8e7d2882b..bc77c3ce4a9c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
- (read_cr4() & X86_CR4_SMEP))
+ (__read_cr4() & X86_CR4_SMEP))
printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 66dba36f2343..0a59a63bcdad 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -144,11 +144,11 @@ static void __init probe_page_size_mask(void)
/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
@@ -687,3 +687,11 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..c23ab1ee3a9a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end)
vaddr = start;
pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4e5dfec750fc..fa77995b62a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1144,7 +1144,7 @@ void mark_rodata_ro(void)
* has been zapped already via cleanup_highmem().
*/
all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
- set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
rodata_test();
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 000000000000..23dc7673e110
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,209 @@
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+extern unsigned char kasan_zero_page[PAGE_SIZE];
+
+static int __init map_range(struct range *range)
+{
+ unsigned long start;
+ unsigned long end;
+
+ start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+ end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+ /*
+	 * end + 1 here is intentional: we check several shadow bytes in
+	 * advance to speed up the fast path slightly. In some rare cases we
+	 * could cross the boundary of the mapped shadow, so we just map some
+	 * more here.
+ */
+ return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+}
+
+static void __init clear_pgds(unsigned long start,
+ unsigned long end)
+{
+ for (; start < end; start += PGDIR_SIZE)
+ pgd_clear(pgd_offset_k(start));
+}
+
+void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+ int i;
+ unsigned long start = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+
+ for (i = pgd_index(start); start < end; i++) {
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+ | _KERNPG_TABLE);
+ start += PGDIR_SIZE;
+ }
+}
+
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ while (addr + PAGE_SIZE <= end) {
+ WARN_ON(!pte_none(*pte));
+ set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+ | __PAGE_KERNEL_RO));
+ addr += PAGE_SIZE;
+ pte = pte_offset_kernel(pmd, addr);
+ }
+ return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ int ret = 0;
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+ WARN_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+ | __PAGE_KERNEL_RO));
+ addr += PMD_SIZE;
+ pmd = pmd_offset(pud, addr);
+ }
+ if (addr < end) {
+ if (pmd_none(*pmd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pte_populate(pmd, addr, end);
+ }
+ return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+ unsigned long end)
+{
+ int ret = 0;
+ pud_t *pud = pud_offset(pgd, addr);
+
+ while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+ WARN_ON(!pud_none(*pud));
+ set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+ | __PAGE_KERNEL_RO));
+ addr += PUD_SIZE;
+ pud = pud_offset(pgd, addr);
+ }
+
+ if (addr < end) {
+ if (pud_none(*pud)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pmd_populate(pud, addr, end);
+ }
+ return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+ int ret = 0;
+ pgd_t *pgd = pgd_offset_k(addr);
+
+ while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+ WARN_ON(!pgd_none(*pgd));
+ set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+ | __PAGE_KERNEL_RO));
+ addr += PGDIR_SIZE;
+ pgd = pgd_offset_k(addr);
+ }
+
+ if (addr < end) {
+ if (pgd_none(*pgd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pud_populate(pgd, addr, end);
+ }
+ return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+ if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+ panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (val == DIE_GPF) {
+ pr_emerg("CONFIG_KASAN_INLINE enabled");
+ pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+ .notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_init(void)
+{
+ int i;
+
+#ifdef CONFIG_KASAN_INLINE
+ register_die_notifier(&kasan_die_notifier);
+#endif
+
+ memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+ load_cr3(early_level4_pgt);
+
+ clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+ for (i = 0; i < E820_X_MAX; i++) {
+ if (pfn_mapped[i].end == 0)
+ break;
+
+ if (map_range(&pfn_mapped[i]))
+ panic("kasan: unable to allocate shadow!");
+ }
+	populate_zero_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ NUMA_NO_NODE);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
+
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+
+ load_cr3(init_level4_pgt);
+ init_task.kasan_depth = 0;
+
+ pr_info("KernelAddressSanitizer initialized\n");
+}
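
Behind all of the mapping that kasan_init() wires up is a fixed linear translation: one shadow byte describes eight bytes of kernel address space, i.e. shadow = (addr >> 3) + offset. A runnable sketch of that arithmetic; the offset below matches the usual x86_64 value but should be treated as a placeholder for CONFIG_KASAN_SHADOW_OFFSET:

#include <stdio.h>
#include <stdint.h>

#define SHADOW_SCALE_SHIFT 3
#define SHADOW_OFFSET      0xdffffc0000000000ULL   /* placeholder */

static uint64_t mem_to_shadow(uint64_t addr)
{
    return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
}

int main(void)
{
    uint64_t addr = 0xffff880000000000ULL;  /* direct-map style address */

    printf("addr %#llx -> shadow %#llx\n",
           (unsigned long long)addr,
           (unsigned long long)mem_to_shadow(addr));
    return 0;
}
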
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 637ab34ed632..ddb2244b06a1 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -33,7 +33,7 @@
struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
- unsigned long page; /* location of the fault page */
+ unsigned long addr; /* the requested address */
pteval_t old_presence; /* page presence prior to arming */
bool armed;
@@ -70,9 +70,16 @@ unsigned int kmmio_count;
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);
-static struct list_head *kmmio_page_list(unsigned long page)
+static struct list_head *kmmio_page_list(unsigned long addr)
{
- return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+
+ return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}
/* Accessed per-cpu */
@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
}
/* You must be holding RCU read lock. */
-static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
struct list_head *head;
struct kmmio_fault_page *f;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
- page &= PAGE_MASK;
- head = kmmio_page_list(page);
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+ head = kmmio_page_list(addr);
list_for_each_entry_rcu(f, head, list) {
- if (f->page == page)
+ if (f->addr == addr)
return f;
}
return NULL;
@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
- pte_t *pte = lookup_address(f->page, &level);
+ pte_t *pte = lookup_address(f->addr, &level);
if (!pte) {
- pr_err("no pte for page 0x%08lx\n", f->page);
+ pr_err("no pte for addr 0x%08lx\n", f->addr);
return -1;
}
@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
- __flush_tlb_one(f->page);
+ __flush_tlb_one(f->addr);
return 0;
}
@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
int ret;
WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
- pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
- f->page, f->count, !!f->old_presence);
+ pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
- WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
- f->page);
+ WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+ f->addr);
f->armed = true;
return ret;
}
@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
- KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+ KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
f->armed = false;
}
@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
int ret = 0; /* default to fault not handled */
+ unsigned long page_base = addr;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+ if (!pte)
+ return -EINVAL;
+ page_base &= page_level_mask(l);
/*
* Preemption is now disabled to prevent process switch during
@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
preempt_disable();
rcu_read_lock();
- faultpage = get_kmmio_fault_page(addr);
+ faultpage = get_kmmio_fault_page(page_base);
if (!faultpage) {
/*
* Either this page fault is not caused by kmmio, or
@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
- if (addr == ctx->addr) {
+ if (page_base == ctx->addr) {
/*
* A second fault on the same page means some other
* condition needs handling by do_page_fault(), the
@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx->active++;
ctx->fpage = faultpage;
- ctx->probe = get_kmmio_probe(addr);
+ ctx->probe = get_kmmio_probe(page_base);
ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- ctx->addr = addr;
+ ctx->addr = page_base;
if (ctx->probe && ctx->probe->pre_handler)
ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -354,12 +371,11 @@ out:
}
/* You must be holding kmmio_lock. */
-static int add_kmmio_fault_page(unsigned long page)
+static int add_kmmio_fault_page(unsigned long addr)
{
struct kmmio_fault_page *f;
- page &= PAGE_MASK;
- f = get_kmmio_fault_page(page);
+ f = get_kmmio_fault_page(addr);
if (f) {
if (!f->count)
arm_kmmio_fault_page(f);
@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
return -1;
f->count = 1;
- f->page = page;
+ f->addr = addr;
if (arm_kmmio_fault_page(f)) {
kfree(f);
return -1;
}
- list_add_rcu(&f->list, kmmio_page_list(f->page));
+ list_add_rcu(&f->list, kmmio_page_list(f->addr));
return 0;
}
/* You must be holding kmmio_lock. */
-static void release_kmmio_fault_page(unsigned long page,
+static void release_kmmio_fault_page(unsigned long addr,
struct kmmio_fault_page **release_list)
{
struct kmmio_fault_page *f;
- page &= PAGE_MASK;
- f = get_kmmio_fault_page(page);
+ f = get_kmmio_fault_page(addr);
if (!f)
return;
@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
int ret = 0;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ unsigned int l;
+ pte_t *pte;
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
ret = -EEXIST;
goto out;
}
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte) {
+ ret = -EINVAL;
+ goto out;
+ }
+
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
pr_err("Unable to set page fault.\n");
- size += PAGE_SIZE;
+ size += page_level_size(l);
}
out:
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
+ unsigned int l;
+ pte_t *pte;
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte)
+ return;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
- size += PAGE_SIZE;
+ size += page_level_size(l);
}
list_del_rcu(&p->list);
kmmio_count--;
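
Every kmmio change above replaces the hard-coded PAGE_MASK rounding with the mask of whatever level actually maps the address, so faults inside 2M and 1G mappings are tracked by their true base. The rounding itself, as a standalone sketch with hard-coded page sizes:

#include <stdio.h>

enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };

static unsigned long long page_level_mask_sim(enum pg_level l)
{
    static const unsigned long long size[] = {
        1ULL << 12,     /* 4K */
        1ULL << 21,     /* 2M */
        1ULL << 30,     /* 1G */
    };
    return ~(size[l] - 1);
}

int main(void)
{
    unsigned long long addr = 0xfed001234ULL;

    printf("4K base: %#llx\n", addr & page_level_mask_sim(PG_LEVEL_4K));
    printf("2M base: %#llx\n", addr & page_level_mask_sim(PG_LEVEL_2M));
    printf("1G base: %#llx\n", addr & page_level_mask_sim(PG_LEVEL_1G));
    return 0;
}
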
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 36de293caf25..e5545f2105f6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -33,7 +33,7 @@ struct cpa_data {
pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
- int numpages;
+ unsigned long numpages;
int flags;
unsigned long pfn;
unsigned force_split : 1;
@@ -1293,7 +1293,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* CPA operation. Either a large page has been
* preserved or a single page update happened.
*/
- BUG_ON(cpa->numpages > numpages);
+ BUG_ON(cpa->numpages > numpages || !cpa->numpages);
numpages -= cpa->numpages;
if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3f627345d51c..82003a36ad96 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -558,6 +558,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (is_ereg(dst_reg))
EMIT1(0x41);
EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
+
+ /* emit 'movzwl eax, ax' */
+ if (is_ereg(dst_reg))
+ EMIT3(0x45, 0x0F, 0xB7);
+ else
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
break;
case 32:
/* emit 'bswap eax' to swap lower 4 bytes */
@@ -576,6 +583,27 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
break;
case BPF_ALU | BPF_END | BPF_FROM_LE:
+ switch (imm32) {
+ case 16:
+ /* emit 'movzwl eax, ax' to zero extend 16-bit
+ * into 64 bit
+ */
+ if (is_ereg(dst_reg))
+ EMIT3(0x45, 0x0F, 0xB7);
+ else
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
+ break;
+ case 32:
+ /* emit 'mov eax, eax' to clear upper 32-bits */
+ if (is_ereg(dst_reg))
+ EMIT1(0x45);
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
+ break;
+ case 64:
+ /* nop */
+ break;
+ }
break;
/* ST: *(u8*)(dst_reg + off) = imm */
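
Why the JIT needs the extra movzwl after a 16-bit swap: operating on the low 16 bits leaves the upper bits of the 64-bit register stale, while BPF semantics require the result zero-extended. This can be shown directly in C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t reg = 0xdeadbeef00001234ULL;

    /* swap the low 16 bits only, keeping the stale upper bits */
    uint64_t swapped = (reg & ~0xffffULL) | __builtin_bswap16(reg & 0xffff);

    /* what the extra movzwl provides: explicit zero extension */
    uint64_t fixed = (uint16_t)__builtin_bswap16((uint16_t)reg);

    printf("without zext: %#llx\n", (unsigned long long)swapped);
    printf("with zext:    %#llx\n", (unsigned long long)fixed);
    return 0;
}
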
@@ -936,7 +964,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
}
ctx.cleanup_addr = proglen;
- for (pass = 0; pass < 10; pass++) {
+ /* JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large bpf programs
+	 * may converge only on the last pass. In such a case, do one
+	 * more pass to emit the final image.
+ */
+ for (pass = 0; pass < 10 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
if (proglen <= 0) {
image = NULL;
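
The loop condition change encodes a convergence scheme: keep JITing while the image shrinks, and once an image buffer exists, run exactly one more pass to emit into it. The control flow in isolation, with fabricated pass sizes:

#include <stdio.h>

int main(void)
{
    int sizes[] = { 400, 380, 380 };    /* pass results, then stable */
    int oldlen = 1 << 30, pass, len = 0;
    void *image = 0;

    for (pass = 0; pass < 10 || image; pass++) {
        len = sizes[pass < 2 ? pass : 2];
        if (image)                      /* the final pass has emitted */
            break;
        if (len == oldlen)              /* converged: allocate and emit */
            image = &sizes;
        oldlen = len;
    }
    printf("converged after pass %d, len %d\n", pass, len);
    return 0;
}
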
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index cfd1b132b8e3..a3e94b4108bf 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -84,6 +84,17 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = {
DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
},
},
+ /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/931368 */
+ /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/1033299 */
+ {
+ .callback = set_use_crs,
+ .ident = "Foxconn K8M890-8237A",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Foxconn"),
+ DMI_MATCH(DMI_BOARD_NAME, "K8M890-8237A"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ },
+ },
/* Now for the blacklist.. */
@@ -124,8 +135,10 @@ void __init pci_acpi_crs_quirks(void)
{
int year;
- if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
- pci_use_crs = false;
+ if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) {
+ if (iomem_resource.end <= 0xffffffff)
+ pci_use_crs = false;
+ }
dmi_check_system(pci_crs_quirks);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2fb384724ebb..8fd6f44aee83 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
if (!bus) {
pci_free_resource_list(&resources);
kfree(sd);
+ return;
}
+ pci_bus_add_devices(bus);
}
void __init pcibios_set_cache_line_size(void)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 9a2b7101ae8a..f16af96c60a2 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -553,3 +553,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
+
+static void pci_bdwep_bar(struct pci_dev *dev)
+{
+ dev->non_compliant_bars = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index b9958c364075..44b9271580b5 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -210,6 +210,9 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
int polarity;
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
+
if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
polarity = 0; /* active high */
else
@@ -224,13 +227,18 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
return -EBUSY;
+ dev->irq_managed = 1;
+
return 0;
}
static void intel_mid_pci_irq_disable(struct pci_dev *dev)
{
- if (!mp_should_keep_irq(&dev->dev) && dev->irq > 0)
+ if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+ dev->irq > 0) {
mp_unmap_irq(dev->irq);
+ dev->irq_managed = 0;
+ }
}
struct pci_ops intel_mid_pci_ops = {
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index eb500c2592ad..a47e2dea0972 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1202,6 +1202,9 @@ static int pirq_enable_irq(struct pci_dev *dev)
int irq;
struct io_apic_irq_attr irq_attr;
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
+
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
PCI_SLOT(dev->devfn),
pin - 1, &irq_attr);
@@ -1228,6 +1231,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
+ dev->irq_managed = 1;
dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1257,8 +1261,9 @@ static int pirq_enable_irq(struct pci_dev *dev)
static void pirq_disable_irq(struct pci_dev *dev)
{
if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
- dev->irq) {
+ dev->irq_managed && dev->irq) {
mp_unmap_irq(dev->irq);
dev->irq = 0;
+ dev->irq_managed = 0;
}
}
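
Both PCI IRQ paths above adopt the same guard: enable becomes idempotent, and disable only releases what enable mapped, so repeated enable/disable cycles cannot unbalance the mapping. The pattern reduced to a sketch:

#include <stdio.h>

struct dev { int irq, irq_managed; };

static void irq_enable(struct dev *d)
{
    if (d->irq_managed && d->irq > 0)
        return;                     /* already mapped: do nothing */
    d->irq = 42;                    /* stands in for the GSI mapping */
    d->irq_managed = 1;
    printf("mapped irq %d\n", d->irq);
}

static void irq_disable(struct dev *d)
{
    if (d->irq_managed && d->irq > 0) {
        printf("unmapped irq %d\n", d->irq);
        d->irq_managed = 0;
    }
}

int main(void)
{
    struct dev d = {0};

    irq_enable(&d); irq_enable(&d);     /* second call is a no-op */
    irq_disable(&d); irq_disable(&d);   /* second call is a no-op */
    return 0;
}
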
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 6b3cf7c4e5c2..df8101fa5bd6 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -229,7 +229,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return 1;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- __read_msi_msg(msidesc, &msg);
+ __pci_read_msi_msg(msidesc, &msg);
pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
if (msg.data != XEN_PIRQ_MSI_DATA ||
@@ -240,7 +240,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
goto error;
}
xen_msi_compose_msg(dev, pirq, &msg);
- __write_msi_msg(msidesc, &msg);
+ __pci_write_msi_msg(msidesc, &msg);
dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
} else {
dev_dbg(&dev->dev,
@@ -394,14 +394,7 @@ static void xen_teardown_msi_irq(unsigned int irq)
{
xen_destroy_irq(irq);
}
-static u32 xen_nop_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
-{
- return 0;
-}
-static u32 xen_nop_msix_mask_irq(struct msi_desc *desc, u32 flag)
-{
- return 0;
-}
+
#endif
int __init pci_xen_init(void)
@@ -425,8 +418,7 @@ int __init pci_xen_init(void)
x86_msi.setup_msi_irqs = xen_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs;
- x86_msi.msi_mask_irq = xen_nop_msi_mask_irq;
- x86_msi.msix_mask_irq = xen_nop_msix_mask_irq;
+ pci_msi_ignore_mask = 1;
#endif
return 0;
}
@@ -442,6 +434,7 @@ int __init pci_xen_hvm_init(void)
* just how GSIs get registered.
*/
__acpi_register_gsi = acpi_register_gsi_xen_hvm;
+ __acpi_unregister_gsi = NULL;
#endif
#ifdef CONFIG_PCI_MSI
@@ -460,12 +453,15 @@ int __init pci_xen_initial_domain(void)
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
- x86_msi.msi_mask_irq = xen_nop_msi_mask_irq;
- x86_msi.msix_mask_irq = xen_nop_msix_mask_irq;
+ pci_msi_ignore_mask = 1;
#endif
__acpi_register_gsi = acpi_register_gsi_xen;
- /* Pre-allocate legacy irqs */
- for (irq = 0; irq < nr_legacy_irqs(); irq++) {
+ __acpi_unregister_gsi = NULL;
+ /*
+ * Pre-allocate the legacy IRQs. Use NR_LEGACY_IRQS here
+ * because we don't have a PIC and thus nr_legacy_irqs() is zero.
+ */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
int trigger, polarity;
if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index dbc8627a5cdf..6d6080f3fa35 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -670,6 +670,70 @@ out:
}
/*
+ * Iterate the EFI memory map in reverse order because the regions
+ * will be mapped top-down. The end result is the same as if we had
+ * mapped things forward, but doesn't require us to change the
+ * existing implementation of efi_map_region().
+ */
+static inline void *efi_map_next_entry_reverse(void *entry)
+{
+ /* Initial call */
+ if (!entry)
+ return memmap.map_end - memmap.desc_size;
+
+ entry -= memmap.desc_size;
+ if (entry < memmap.map)
+ return NULL;
+
+ return entry;
+}
+
+/*
+ * efi_map_next_entry - Return the next EFI memory map descriptor
+ * @entry: Previous EFI memory map descriptor
+ *
+ * This is a helper function to iterate over the EFI memory map, which
+ * we do in different orders depending on the current configuration.
+ *
+ * To begin traversing the memory map @entry must be %NULL.
+ *
+ * Returns %NULL when we reach the end of the memory map.
+ */
+static void *efi_map_next_entry(void *entry)
+{
+ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
+ /*
+ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
+ * config table feature requires us to map all entries
+ * in the same order as they appear in the EFI memory
+ * map. That is to say, entry N must have a lower
+ * virtual address than entry N+1. This is because the
+ * firmware toolchain leaves relative references in
+ * the code/data sections, which are split and become
+ * separate EFI memory regions. Mapping things
+ * out-of-order leads to the firmware accessing
+ * unmapped addresses.
+ *
+ * Since we need to map things this way whether or not
+ * the kernel actually makes use of
+ * EFI_PROPERTIES_TABLE, let's just switch to this
+ * scheme by default for 64-bit.
+ */
+ return efi_map_next_entry_reverse(entry);
+ }
+
+ /* Initial call */
+ if (!entry)
+ return memmap.map;
+
+ entry += memmap.desc_size;
+ if (entry >= memmap.map_end)
+ return NULL;
+
+ return entry;
+}
+
+/*
* Map the efi memory ranges of the runtime services and update new_mmap with
* virtual addresses.
*/
@@ -679,7 +743,8 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
unsigned long left = 0;
efi_memory_desc_t *md;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ p = NULL;
+ while ((p = efi_map_next_entry(p))) {
md = p;
if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
#ifdef CONFIG_X86_64
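
The reverse iterator above is an ordinary fixed-stride walk over an array of descriptors, where NULL means "start" on the way in and "done" on the way out. The same shape, runnable over a toy descriptor array:

#include <stdio.h>
#include <stddef.h>

#define DESC_SIZE  sizeof(int)
static int map[4] = { 1, 2, 3, 4 };
static void *map_end = (char *)map + sizeof(map);

static void *next_entry_reverse(void *entry)
{
    if (!entry)                             /* initial call */
        return (char *)map_end - DESC_SIZE;
    entry = (char *)entry - DESC_SIZE;
    return entry < (void *)map ? NULL : entry;
}

int main(void)
{
    void *p = NULL;

    while ((p = next_entry_reverse(p)))
        printf("%d ", *(int *)p);           /* prints 4 3 2 1 */
    printf("\n");
    return 0;
}
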
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6ec7910f59bf..a13a38830e76 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
#include <asm/debugreg.h>
#include <asm/fpu-internal.h> /* pcntxt_mask */
#include <asm/cpu.h>
+#include <asm/mmu_context.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -105,11 +106,8 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
-#ifdef CONFIG_X86_32
- ctxt->cr4 = read_cr4_safe();
-#else
-/* CONFIG_X86_64 */
- ctxt->cr4 = read_cr4();
+ ctxt->cr4 = __read_cr4_safe();
+#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
#endif
ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
@@ -157,7 +155,7 @@ static void fix_processor_context(void)
syscall_init(); /* This sets MSR_*STAR and related */
#endif
load_TR_desc(); /* This does ltr */
- load_LDT(&current->active_mm->context); /* This does lldt */
+ load_mm_ldt(current->active_mm); /* This does lldt */
}
/**
@@ -175,12 +173,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
/* cr4 was introduced in the Pentium CPU */
#ifdef CONFIG_X86_32
if (ctxt->cr4)
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#else
/* CONFIG X86_64 */
wrmsrl(MSR_EFER, ctxt->efer);
write_cr8(ctxt->cr8);
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#endif
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index 94f7fbe97b08..e02c2c6c56a5 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -6,7 +6,7 @@
# for more details.
#
#
-
+KASAN_SANITIZE := n
subdir- := rm
obj-y += init.o
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index bad628a620c4..0b7a63d98440 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -81,7 +81,7 @@ void __init setup_real_mode(void)
trampoline_header->start = (u64) secondary_startup_64;
trampoline_cr4_features = &trampoline_header->cr4;
- *trampoline_cr4_features = read_cr4();
+ *trampoline_cr4_features = __read_cr4();
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 7c0d7be176a5..2730d775ef9a 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -6,6 +6,7 @@
# for more details.
#
#
+KASAN_SANITIZE := n
always := realmode.bin realmode.relocs
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 5a4affe025e8..2aacd7c63c7b 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -3,6 +3,7 @@
#
KBUILD_CFLAGS += $(DISABLE_LTO)
+KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index e88fda867a33..484145368a24 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -8,7 +8,7 @@ config XEN
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
depends on X86_64 || (X86_32 && X86_PAE)
- depends on X86_TSC
+ depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@@ -17,7 +17,7 @@ config XEN
config XEN_DOM0
def_bool y
depends on XEN && PCI_XEN && SWIOTLB_XEN
- depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+ depends on X86_IO_APIC && ACPI && PCI
config XEN_PVHVM
def_bool y
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7322755f337a..4b6e29ac0968 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,13 +13,13 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
- p2m.o
+ p2m.o apic.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fac5e4f9607c..6ba1ec961aaa 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,10 @@
#include <linux/memblock.h>
#include <linux/edd.h>
+#ifdef CONFIG_KEXEC_CORE
+#include <linux/kexec.h>
+#endif
+
#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
@@ -481,6 +485,7 @@ static void set_aliased_prot(void *v, pgprot_t prot)
pte_t pte;
unsigned long pfn;
struct page *page;
+ unsigned char dummy;
ptep = lookup_address((unsigned long)v, &level);
BUG_ON(ptep == NULL);
@@ -490,6 +495,32 @@ static void set_aliased_prot(void *v, pgprot_t prot)
pte = pfn_pte(pfn, prot);
+ /*
+ * Careful: update_va_mapping() will fail if the virtual address
+ * we're poking isn't populated in the page tables. We don't
+ * need to worry about the direct map (that's always in the page
+ * tables), but we need to be careful about vmap space. In
+ * particular, the top level page table can lazily propagate
+ * entries between processes, so if we've switched mms since we
+ * vmapped the target in the first place, we might not have the
+ * top-level page table entry populated.
+ *
+ * We disable preemption because we want the same mm active when
+ * we probe the target and when we issue the hypercall. We'll
+ * have the same nominal mm, but if we're a kernel thread, lazy
+ * mm dropping could change our pgd.
+ *
+ * Out of an abundance of caution, this uses __get_user() to fault
+ * in the target address just in case there's some obscure case
+ * in which the target address isn't readable.
+ */
+
+ preempt_disable();
+
+ pagefault_disable(); /* Avoid warnings due to being atomic. */
+ __get_user(dummy, (unsigned char __user __force *)v);
+ pagefault_enable();
+
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();
@@ -501,6 +532,8 @@ static void set_aliased_prot(void *v, pgprot_t prot)
BUG();
} else
kmap_flush_unused();
+
+ preempt_enable();
}
static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
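
The preempt/pagefault dance in set_aliased_prot() boils down to: probe the target with a read that is allowed to fail, so a lazily-propagated mapping gets populated before the hypercall that cannot tolerate a fault. A loose userspace analogy; probe_read() stands in for __get_user() under pagefault_disable():

#include <stdio.h>

static int mapping_present;     /* models a lazily-synced PGD entry */

static int probe_read(const volatile char *p)
{
    (void)p;
    mapping_present = 1;        /* the touch faults the mapping in */
    return 0;                   /* a failure here would be tolerated */
}

static int hypercall_update(void)
{
    return mapping_present ? 0 : -1;   /* fails on unmapped addresses */
}

int main(void)
{
    char target = 0;

    probe_read(&target);                   /* harmless probe first */
    printf("update %s\n", hypercall_update() ? "failed" : "ok");
    return 0;
}
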
@@ -508,6 +541,17 @@ static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
+ /*
+	 * We need to mark all aliases of the LDT pages RO. We
+	 * don't need to call vm_flush_aliases(), though, since that's
+	 * only responsible for flushing aliases out of the TLBs, not the
+ * page tables, and Xen will flush the TLB for us if needed.
+ *
+ * To avoid confusing future readers: none of this is necessary
+ * to load the LDT. The hypervisor only checks this when the
+ * LDT is faulted in due to subsequent descriptor access.
+ */
+
for(i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
@@ -912,7 +956,7 @@ static void xen_load_sp0(struct tss_struct *tss,
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;
@@ -1483,10 +1527,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
}
/*
@@ -1732,6 +1776,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
#ifdef CONFIG_X86_32
i386_start_kernel();
#else
+ cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
@@ -1818,6 +1863,21 @@ static struct notifier_block xen_hvm_cpu_notifier = {
.notifier_call = xen_hvm_cpu_notify,
};
+#ifdef CONFIG_KEXEC_CORE
+static void xen_hvm_shutdown(void)
+{
+ native_machine_shutdown();
+ if (kexec_in_progress)
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+
+static void xen_hvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_machine_crash_shutdown(regs);
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+#endif
+
static void __init xen_hvm_guest_init(void)
{
init_hvm_pv_info();
@@ -1834,6 +1894,10 @@ static void __init xen_hvm_guest_init(void)
x86_init.irqs.intr_init = xen_init_IRQ;
xen_hvm_init_time_ops();
xen_hvm_init_mmu_ops();
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.shutdown = xen_hvm_shutdown;
+ machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
+#endif
}
static bool xen_nopv = false;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 28c7e0be56e4..566004cc8a5b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -94,17 +94,15 @@ struct dom0_vga_console_info;
#ifdef CONFIG_XEN_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
-void __init xen_init_apic(void);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
size_t size)
{
}
-static inline void __init xen_init_apic(void)
-{
-}
#endif
+void __init xen_init_apic(void);
+
#ifdef CONFIG_XEN_EFI
extern void xen_efi_init(void);
#else