aboutsummaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/mach-omap2/clockdomains7xx_data.c2
-rw-r--r--arch/arm64/Kconfig20
-rw-r--r--arch/arm64/Makefile4
-rw-r--r--arch/arm64/kernel/head.S5
-rw-r--r--arch/arm64/kernel/module.c2
-rw-r--r--arch/arm64/kernel/signal32.c47
-rw-r--r--arch/arm64/kvm/hyp.S5
-rw-r--r--arch/parisc/kernel/irq.c8
-rw-r--r--arch/parisc/kernel/syscall.S2
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h14
-rw-r--r--arch/powerpc/include/asm/rtas.h1
-rw-r--r--arch/powerpc/kernel/rtas.c17
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c3
-rw-r--r--arch/powerpc/platforms/pseries/ras.c3
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c1
-rw-r--r--arch/x86/kernel/entry_64.S296
-rw-r--r--arch/x86/kernel/nmi.c123
-rw-r--r--arch/x86/kvm/mmu.c45
-rw-r--r--arch/x86/mm/init_32.c1
-rw-r--r--arch/xtensa/include/asm/traps.h29
-rw-r--r--arch/xtensa/kernel/entry.S7
21 files changed, 387 insertions, 248 deletions
diff --git a/arch/arm/mach-omap2/clockdomains7xx_data.c b/arch/arm/mach-omap2/clockdomains7xx_data.c
index 57d5df0c1fbd..7581e036bda6 100644
--- a/arch/arm/mach-omap2/clockdomains7xx_data.c
+++ b/arch/arm/mach-omap2/clockdomains7xx_data.c
@@ -331,7 +331,7 @@ static struct clockdomain l4per2_7xx_clkdm = {
.dep_bit = DRA7XX_L4PER2_STATDEP_SHIFT,
.wkdep_srcs = l4per2_wkup_sleep_deps,
.sleepdep_srcs = l4per2_wkup_sleep_deps,
- .flags = CLKDM_CAN_HWSUP_SWSUP,
+ .flags = CLKDM_CAN_SWSUP,
};
static struct clockdomain mpu0_7xx_clkdm = {
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b0424cf0fa4f..7340e1f4e183 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -84,6 +84,10 @@ config NO_IOPORT
config STACKTRACE_SUPPORT
def_bool y
+config ILLEGAL_POINTER_VALUE
+ hex
+ default 0xdead000000000000
+
config LOCKDEP_SUPPORT
def_bool y
@@ -489,6 +493,22 @@ menu "CPU Power Management"
source "drivers/cpuidle/Kconfig"
+config ARM64_ERRATUM_843419
+ bool "Cortex-A53: 843419: A load or store might access an incorrect address"
+ depends on MODULES
+ default y
+ help
+ This option builds kernel modules using the large memory model in
+ order to avoid the use of the ADRP instruction, which can cause
+ a subsequent memory access to use an incorrect address on Cortex-A53
+ parts up to r0p4.
+
+ Note that the kernel itself must be linked with a version of ld
+ which fixes potentially affected ADRP instructions through the
+ use of veneers.
+
+ If unsure, say Y.
+
endmenu
source "net/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 8185a913c5ed..be8b36304ac1 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -34,6 +34,10 @@ comma = ,
CHECKFLAGS += -D__aarch64__
+ifeq ($(CONFIG_ARM64_ERRATUM_843419), y)
+CFLAGS_MODULE += -mcmodel=large
+endif
+
# Default value
head-y := arch/arm64/kernel/head.o
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index e28eaef0568f..c295a20bb98e 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -322,6 +322,11 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems
msr hstr_el2, xzr // Disable CP15 traps to EL2
#endif
+ /* EL2 debug */
+ mrs x0, pmcr_el0 // Disable debug access traps
+ ubfx x0, x0, #11, #5 // to EL2 and allow access to
+ msr mdcr_el2, x0 // all PMU counters from EL1
+
/* Stage-2 translation */
msr vttbr_el2, xzr
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 1eb1cc955139..e366329d96d8 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -330,12 +330,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21,
AARCH64_INSN_IMM_ADR);
break;
+#ifndef CONFIG_ARM64_ERRATUM_843419
case R_AARCH64_ADR_PREL_PG_HI21_NC:
overflow_check = false;
case R_AARCH64_ADR_PREL_PG_HI21:
ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21,
AARCH64_INSN_IMM_ADR);
break;
+#endif
case R_AARCH64_ADD_ABS_LO12_NC:
case R_AARCH64_LDST8_ABS_LO12_NC:
overflow_check = false;
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index e6c4085d2ec0..9ebdaff7e046 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -203,14 +203,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
/*
* VFP save/restore code.
+ *
+ * We have to be careful with endianness, since the fpsimd context-switch
+ * code operates on 128-bit (Q) register values whereas the compat ABI
+ * uses an array of 64-bit (D) registers. Consequently, we need to swap
+ * the two halves of each Q register when running on a big-endian CPU.
*/
+union __fpsimd_vreg {
+ __uint128_t raw;
+ struct {
+#ifdef __AARCH64EB__
+ u64 hi;
+ u64 lo;
+#else
+ u64 lo;
+ u64 hi;
+#endif
+ };
+};
+
static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
{
struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
compat_ulong_t magic = VFP_MAGIC;
compat_ulong_t size = VFP_STORAGE_SIZE;
compat_ulong_t fpscr, fpexc;
- int err = 0;
+ int i, err = 0;
/*
* Save the hardware registers to the fpsimd_state structure.
@@ -226,10 +244,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
/*
* Now copy the FP registers. Since the registers are packed,
* we can copy the prefix we want (V0-V15) as it is.
- * FIXME: Won't work if big endian.
*/
- err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs,
- sizeof(frame->ufp.fpregs));
+ for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+ union __fpsimd_vreg vreg = {
+ .raw = fpsimd->vregs[i >> 1],
+ };
+
+ __put_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+ __put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+ }
/* Create an AArch32 fpscr from the fpsr and the fpcr. */
fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) |
@@ -254,7 +277,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
compat_ulong_t magic = VFP_MAGIC;
compat_ulong_t size = VFP_STORAGE_SIZE;
compat_ulong_t fpscr;
- int err = 0;
+ int i, err = 0;
__get_user_error(magic, &frame->magic, err);
__get_user_error(size, &frame->size, err);
@@ -264,12 +287,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE)
return -EINVAL;
- /*
- * Copy the FP registers into the start of the fpsimd_state.
- * FIXME: Won't work if big endian.
- */
- err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs,
- sizeof(frame->ufp.fpregs));
+ /* Copy the FP registers into the start of the fpsimd_state. */
+ for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+ union __fpsimd_vreg vreg;
+
+ __get_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+ __get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+ fpsimd.vregs[i >> 1] = vreg.raw;
+ }
/* Extract the fpsr and the fpcr from the fpscr */
__get_user_error(fpscr, &frame->ufp.fpscr, err);
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index a767f6a4ce54..566a457d1803 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -843,8 +843,6 @@
mrs x3, cntv_ctl_el0
and x3, x3, #3
str w3, [x0, #VCPU_TIMER_CNTV_CTL]
- bic x3, x3, #1 // Clear Enable
- msr cntv_ctl_el0, x3
isb
@@ -852,6 +850,9 @@
str x3, [x0, #VCPU_TIMER_CNTV_CVAL]
1:
+ // Disable the virtual timer
+ msr cntv_ctl_el0, xzr
+
// Allow physical timer/counter access for the host
mrs x2, cnthctl_el2
orr x2, x2, #3
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 8ceac4785609..1ce320e4d3d8 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -507,8 +507,8 @@ void do_cpu_irq_mask(struct pt_regs *regs)
struct pt_regs *old_regs;
unsigned long eirr_val;
int irq, cpu = smp_processor_id();
-#ifdef CONFIG_SMP
struct irq_desc *desc;
+#ifdef CONFIG_SMP
cpumask_t dest;
#endif
@@ -521,8 +521,12 @@ void do_cpu_irq_mask(struct pt_regs *regs)
goto set_out;
irq = eirr_to_irq(eirr_val);
-#ifdef CONFIG_SMP
+ /* Filter out spurious interrupts, mostly from serial port at bootup */
desc = irq_to_desc(irq);
+ if (unlikely(!desc->action))
+ goto set_out;
+
+#ifdef CONFIG_SMP
cpumask_copy(&dest, desc->irq_data.affinity);
if (irqd_is_per_cpu(&desc->irq_data) &&
!cpu_isset(smp_processor_id(), dest)) {
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3387e0..0b8d26d3ba43 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -821,7 +821,7 @@ cas2_action:
/* 64bit CAS */
#ifdef CONFIG_64BIT
19: ldd,ma 0(%sr3,%r26), %r29
- sub,= %r29, %r25, %r0
+ sub,*= %r29, %r25, %r0
b,n cas2_end
20: std,ma %r24, 0(%sr3,%r26)
copy %r0, %r28
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 7b3d54fae46f..7356053b1133 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -135,7 +135,19 @@
#define pte_iterate_hashed_end() } while(0)
#ifdef CONFIG_PPC_HAS_HASH_64K
-#define pte_pagesize_index(mm, addr, pte) get_slice_psize(mm, addr)
+/*
+ * We expect this to be called only for user addresses or kernel virtual
+ * addresses other than the linear mapping.
+ */
+#define pte_pagesize_index(mm, addr, pte) \
+ ({ \
+ unsigned int psize; \
+ if (is_kernel_addr(addr)) \
+ psize = MMU_PAGE_4K; \
+ else \
+ psize = get_slice_psize(mm, addr); \
+ psize; \
+ })
#else
#define pte_pagesize_index(mm, addr, pte) MMU_PAGE_4K
#endif
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 9bd52c65e66f..14de1385dedb 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -255,6 +255,7 @@ extern void rtas_power_off(void);
extern void rtas_halt(void);
extern void rtas_os_term(char *str);
extern int rtas_get_sensor(int sensor, int index, int *state);
+extern int rtas_get_sensor_fast(int sensor, int index, int *state);
extern int rtas_get_power_level(int powerdomain, int *level);
extern int rtas_set_power_level(int powerdomain, int level, int *setlevel);
extern bool rtas_indicator_present(int token, int *maxindex);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 8c923bac430f..4448bc94b361 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -584,6 +584,23 @@ int rtas_get_sensor(int sensor, int index, int *state)
}
EXPORT_SYMBOL(rtas_get_sensor);
+int rtas_get_sensor_fast(int sensor, int index, int *state)
+{
+ int token = rtas_token("get-sensor-state");
+ int rc;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ rc = rtas_call(token, 2, 2, state, sensor, index);
+ WARN_ON(rc == RTAS_BUSY || (rc >= RTAS_EXTENDED_DELAY_MIN &&
+ rc <= RTAS_EXTENDED_DELAY_MAX));
+
+ if (rc < 0)
+ return rtas_error_rc(rc);
+ return rc;
+}
+
bool rtas_indicator_present(int token, int *maxindex)
{
int proplen, count, i;
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 5f5e6328c21c..5061c6f676da 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -136,7 +136,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
BUG_ON(index >= 4096);
vpn = hpt_vpn(ea, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
hpte_slot_array = get_hpte_slot_array(pmdp);
if (psize == MMU_PAGE_4K) {
/*
@@ -151,6 +150,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
valid = hpte_valid(hpte_slot_array, index);
if (valid) {
/* update the hpte bits */
+ hash = hpt_hash(vpn, shift, ssize);
hidx = hpte_hash_index(hpte_slot_array, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
@@ -176,6 +176,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
if (!valid) {
unsigned long hpte_group;
+ hash = hpt_hash(vpn, shift, ssize);
/* insert new entry */
pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
new_pmd |= _PAGE_HASHPTE;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 721c0586b284..50fd3ac7b7bf 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -187,7 +187,8 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
int state;
int critical;
- status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
+ status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
+ &state);
if (state > 3)
critical = 1; /* Time Critical */
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index a8d6f69f92a3..4bcf841e4701 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -291,6 +291,7 @@ static struct ahash_alg ghash_async_alg = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
.cra_priority = 400,
+ .cra_ctxsize = sizeof(struct ghash_async_ctx),
.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_type = &crypto_ahash_type,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 06469ee0f26e..6d6ab2b0bdfa 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1702,11 +1702,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1715,52 +1716,184 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as out temp variable throughout */
pushq_cfi %rdx
CFI_REL_OFFSET rdx, 0
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ */
+ SWAPGS
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(kernel_stack), %rsp
+ addq $KERNEL_STACK_OFFSET, %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+
+ /*
+ * Open-code the entire return process for compatibility with varying
+ * register layouts across different kernel versions.
+ */
+ addq $6*8, %rsp /* skip bx, bp, and r12-r15 */
+ popq %r11 /* pt_regs->r11 */
+ popq %r10 /* pt_regs->r10 */
+ popq %r9 /* pt_regs->r9 */
+ popq %r8 /* pt_regs->r8 */
+ popq %rax /* pt_regs->ax */
+ popq %rcx /* pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+ addq $8, %rsp /* skip orig_ax */
+ INTERRUPT_RETURN
+
+.Lnmi_from_kernel:
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
+
CFI_REMEMBER_STATE
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 1*8
@@ -1779,60 +1912,23 @@ nested_nmi_out:
popq_cfi %rdx
CFI_RESTORE rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
CFI_RESTORE_STATE
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
CFI_RESTORE rdx
- /* Set the NMI executing variable on the stack. */
+ /* Set "NMI executing" on the stack. */
pushq_cfi $1
- /*
- * Leave room for the "copied" frame
- */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
CFI_ADJUST_CFA_OFFSET 5*8
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq_cfi 11*8(%rsp)
.endr
@@ -1840,6 +1936,7 @@ first_nmi:
/* Everything up to here is safe from nested NMIs */
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1848,16 +1945,21 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi.
+ *
+ * Set "NMI executing" in case we came back here via IRET.
*/
movq $1, 10*8(%rsp)
- /* Make another copy, this one may be modified by nested NMIs */
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
addq $(10*8), %rsp
CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
@@ -1868,9 +1970,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
@@ -1885,28 +1987,10 @@ end_repeat_nmi:
call save_paranoid
DEFAULT_FRAME 0
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
@@ -1916,9 +2000,23 @@ nmi_restore:
/* Pop the extra iret frame at once */
RESTORE_ALL 6*8
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
- jmp irq_return
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
+ INTERRUPT_RETURN
CFI_ENDPROC
END(nmi)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49ce50a1..8facfb318a97 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -392,15 +392,15 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
}
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -414,15 +414,14 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -445,49 +444,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
*
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -498,22 +484,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -523,8 +495,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
void stop_nmi(void)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5ba31d053088..07b9861adafa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -376,12 +376,6 @@ static u64 __get_spte_lockless(u64 *sptep)
{
return ACCESS_ONCE(*sptep);
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- /* It is valid if the spte is zapped. */
- return spte == 0ull;
-}
#else
union split_spte {
struct {
@@ -497,23 +491,6 @@ retry:
return spte.spte;
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- union split_spte sspte = (union split_spte)spte;
- u32 high_mmio_mask = shadow_mmio_mask >> 32;
-
- /* It is valid if the spte is zapped. */
- if (spte == 0ull)
- return true;
-
- /* It is valid if the spte is being zapped. */
- if (sspte.spte_low == 0ull &&
- (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
- return true;
-
- return false;
-}
#endif
static bool spte_is_locklessly_modifiable(u64 spte)
@@ -3210,21 +3187,6 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return vcpu_match_mmio_gva(vcpu, addr);
}
-
-/*
- * On direct hosts, the last spte is only allows two states
- * for mmio page fault:
- * - It is the mmio spte
- * - It is zapped or it is being zapped.
- *
- * This function completely checks the spte when the last spte
- * is not the mmio spte.
- */
-static bool check_direct_spte_mmio_pf(u64 spte)
-{
- return __check_direct_spte_mmio_pf(spte);
-}
-
static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
{
struct kvm_shadow_walk_iterator iterator;
@@ -3267,13 +3229,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
/*
- * It's ok if the gva is remapped by other cpus on shadow guest,
- * it's a BUG if the gfn is not a mmio page.
- */
- if (direct && !check_direct_spte_mmio_pf(spte))
- return RET_MMIO_PF_BUG;
-
- /*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e39504878aec..a7b5b3071072 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end)
vaddr = start;
pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h
index 677bfcf4ee5d..28f33a8b7f5f 100644
--- a/arch/xtensa/include/asm/traps.h
+++ b/arch/xtensa/include/asm/traps.h
@@ -25,30 +25,39 @@ static inline void spill_registers(void)
{
#if XCHAL_NUM_AREGS > 16
__asm__ __volatile__ (
- " call12 1f\n"
+ " call8 1f\n"
" _j 2f\n"
" retw\n"
" .align 4\n"
"1:\n"
+#if XCHAL_NUM_AREGS == 32
+ " _entry a1, 32\n"
+ " addi a8, a0, 3\n"
+ " _entry a1, 16\n"
+ " mov a12, a12\n"
+ " retw\n"
+#else
" _entry a1, 48\n"
- " addi a12, a0, 3\n"
-#if XCHAL_NUM_AREGS > 32
- " .rept (" __stringify(XCHAL_NUM_AREGS) " - 32) / 12\n"
+ " call12 1f\n"
+ " retw\n"
+ " .align 4\n"
+ "1:\n"
+ " .rept (" __stringify(XCHAL_NUM_AREGS) " - 16) / 12\n"
" _entry a1, 48\n"
" mov a12, a0\n"
" .endr\n"
-#endif
- " _entry a1, 48\n"
+ " _entry a1, 16\n"
#if XCHAL_NUM_AREGS % 12 == 0
- " mov a8, a8\n"
-#elif XCHAL_NUM_AREGS % 12 == 4
" mov a12, a12\n"
-#elif XCHAL_NUM_AREGS % 12 == 8
+#elif XCHAL_NUM_AREGS % 12 == 4
" mov a4, a4\n"
+#elif XCHAL_NUM_AREGS % 12 == 8
+ " mov a8, a8\n"
#endif
" retw\n"
+#endif
"2:\n"
- : : : "a12", "a13", "memory");
+ : : : "a8", "a9", "memory");
#else
__asm__ __volatile__ (
" mov a12, a12\n"
diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index a06b7efaae82..cf8a354fa628 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -568,12 +568,13 @@ user_exception_exit:
* (if we have restored WSBITS-1 frames).
*/
+2:
#if XCHAL_HAVE_THREADPTR
l32i a3, a1, PT_THREADPTR
wur a3, threadptr
#endif
-2: j common_exception_exit
+ j common_exception_exit
/* This is the kernel exception exit.
* We avoided to do a MOVSP when we entered the exception, but we
@@ -1792,7 +1793,7 @@ ENDPROC(system_call)
mov a12, a0
.endr
#endif
- _entry a1, 48
+ _entry a1, 16
#if XCHAL_NUM_AREGS % 12 == 0
mov a8, a8
#elif XCHAL_NUM_AREGS % 12 == 4
@@ -1816,7 +1817,7 @@ ENDPROC(system_call)
ENTRY(_switch_to)
- entry a1, 16
+ entry a1, 48
mov a11, a3 # and 'next' (a3)