aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/entry_64.S163
-rw-r--r--arch/x86/entry/entry_64_compat.S8
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c5
3 files changed, 157 insertions, 19 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 65916d49dbc9..22803e2f7495 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
+#include <asm/kaiser.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -478,6 +505,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -535,6 +563,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -628,6 +657,7 @@ native_irq_return_ldt:
pushq %rdi /* Stash user RDI */
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -654,6 +684,7 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
@@ -1040,7 +1071,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
*/
ENTRY(paranoid_entry)
cld
@@ -1053,7 +1088,26 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+1:
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+2:
+#endif
+ ret
END(paranoid_entry)
/*
@@ -1066,19 +1120,26 @@ END(paranoid_entry)
* be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
*/
ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx, %ebx /* swapgs needed? */
+ TRACE_IRQS_IRETQ_DEBUG
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1093,6 +1154,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1253,6 +1321,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ /*
+ * percpu variables are mapped with user CR3, so no need
+ * to switch CR3 here.
+ */
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1286,12 +1358,34 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
call do_nmi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Unconditionally restore CR3. I know we return to
+ * kernel code that needs user CR3, but do we ever return
+ * to "user mode" where we need the kernel CR3?
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts. Do not
+ * switch to user CR3: we might be going back to kernel code
+ * that had a user CR3 set.
*/
SWAPGS
jmp restore_c_regs_and_iret
@@ -1488,22 +1582,55 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context.
- * Even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
+ * Use the same approach as paranoid_entry to handle SWAPGS, but
+ * without CR3 handling since we do that differently in NMIs. No
+ * need to use paranoid_exit as we should not be calling schedule
+ * in NMI context. Even with normal interrupts enabled. An NMI
+ * should not be setting NEED_RESCHED or anything that normal
+ * interrupts and exceptions might do.
*/
- call paranoid_entry
-
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ cld
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+1:
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Unconditionally restore CR3. We might be returning to
+ * kernel code that needs user CR3, like just just before
+ * a sysret.
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ /* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..d76a97653980 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,8 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -48,6 +50,7 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -259,6 +263,7 @@ sysret32_from_system_call:
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
+ SWITCH_USER_CR3
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
-
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
* the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 636c4b341f36..6bb7e92c6d50 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -66,6 +66,11 @@ static int __init vsyscall_setup(char *str)
}
early_param("vsyscall", vsyscall_setup);
+bool vsyscall_enabled(void)
+{
+ return vsyscall_mode != NONE;
+}
+
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{