aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/Makefile4
-rw-r--r--arch/x86/entry/entry_32.S1249
-rw-r--r--arch/x86/entry/entry_64.S1442
3 files changed, 2695 insertions, 0 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000000..fa7e0cf6d3c4
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,4 @@
+#
+# Makefile for the x86 low level entry code
+#
+obj-y := entry_$(BITS).o
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000000..0ac73de925d1
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1249 @@
+/*
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'syscall_exit':
+ * ptrace needs to have all regs on the stack.
+ * if the order here is changed, it needs to be
+ * updated in fork.c:copy_process, signal.c:do_signal,
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit syscall_trace_entry
+#define sysexit_audit syscall_exit_work
+#endif
+
+ .section .entry.text, "ax"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+#define preempt_stop(clobbers)
+#define resume_kernel restore_all
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ .if \pop <> 0
+ add $\pop, %esp
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b,99b)
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b,99b)
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
+ movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
+
+.macro RESTORE_INT_REGS
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+.endm
+
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+2: popl %es
+3: popl %fs
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.popsection
+ _ASM_EXTABLE(1b,4b)
+ _ASM_EXTABLE(2b,5b)
+ _ASM_EXTABLE(3b,6b)
+ POP_GS_EX
+.endm
+
+ENTRY(ret_from_fork)
+ pushl %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ jmp syscall_exit
+END(ret_from_fork)
+
+ENTRY(ret_from_kernel_thread)
+ pushl %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ movl PT_EBP(%esp),%eax
+ call *PT_EBX(%esp)
+ movl $0,PT_EAX(%esp)
+ jmp syscall_exit
+ENDPROC(ret_from_kernel_thread)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ret_from_exception:
+ preempt_stop(CLBR_ANY)
+ret_from_intr:
+ GET_THREAD_INFO(%ebp)
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
+ # int/exception return?
+ jne work_pending
+ jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+need_resched:
+ cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz restore_all
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all
+ call preempt_schedule_irq
+ jmp need_resched
+END(resume_kernel)
+#endif
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
+
+ # sysenter call handler stub
+ENTRY(ia32_sysenter_target)
+ movl TSS_sysenter_sp0(%esp),%esp
+sysenter_past_esp:
+ /*
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
+ */
+ pushl $__USER_DS
+ pushl %ebp
+ pushfl
+ orl $X86_EFLAGS_IF, (%esp)
+ pushl $__USER_CS
+ /*
+ * Push current_thread_info()->sysenter_return to the stack.
+ * A tiny bit of offset fixup is necessary: TI_sysenter_return
+ * is relative to thread_info, which is at the bottom of the
+ * kernel stack page. 4*4 means the 4 words pushed above;
+ * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+ * and THREAD_SIZE takes us to the bottom.
+ */
+ pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
+
+ pushl %eax
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-3,%ebp
+ jae syscall_fault
+ ASM_STAC
+1: movl (%ebp),%ebp
+ ASM_CLAC
+ movl %ebp,PT_EBP(%esp)
+ _ASM_EXTABLE(1b,syscall_fault)
+
+ GET_THREAD_INFO(%ebp)
+
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ jnz sysenter_audit
+sysenter_do_call:
+ cmpl $(NR_syscalls), %eax
+ jae sysenter_badsys
+ call *sys_call_table(,%eax,4)
+sysenter_after_call:
+ movl %eax,PT_EAX(%esp)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx
+ jnz sysexit_audit
+sysenter_exit:
+/* if something modifies registers it must also disable sysexit */
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
+ xorl %ebp,%ebp
+ TRACE_IRQS_ON
+1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
+ ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
+ /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
+ pushl PT_ESI(%esp) /* a3: 5th arg */
+ pushl PT_EDX+4(%esp) /* a2: 4th arg */
+ call __audit_syscall_entry
+ popl %ecx /* get that remapped edx off the stack */
+ popl %ecx /* get that remapped esi off the stack */
+ movl PT_EAX(%esp),%eax /* reload syscall number */
+ jmp sysenter_do_call
+
+sysexit_audit:
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ movl %eax,%edx /* second arg, syscall return value */
+ cmpl $-MAX_ERRNO,%eax /* is it an error ? */
+ setbe %al /* 1 if so, 0 if not */
+ movzbl %al,%eax /* zero-extend that */
+ call __audit_syscall_exit
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
+ movl PT_EAX(%esp),%eax /* reload syscall return value */
+ jmp sysenter_exit
+#endif
+
+.pushsection .fixup,"ax"
+2: movl $0,PT_FS(%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b,2b)
+ PTGS_TO_GS_EX
+ENDPROC(ia32_sysenter_target)
+
+ # system call handler stub
+ENTRY(system_call)
+ ASM_CLAC
+ pushl %eax # save orig_eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ # system call tracing in operation / emulation
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(NR_syscalls), %eax
+ jae syscall_badsys
+syscall_call:
+ call *sys_call_table(,%eax,4)
+syscall_after_call:
+ movl %eax,PT_EAX(%esp) # store the return value
+syscall_exit:
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
+ jnz syscall_exit_work
+
+restore_all:
+ TRACE_IRQS_IRET
+restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ # are returning to the kernel.
+ # See comments in process.c:copy_thread() for details.
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ je ldt_ss # returning to user-space with LDT SS
+#endif
+restore_nocheck:
+ RESTORE_REGS 4 # skip orig_eax/error_code
+irq_return:
+ INTERRUPT_RETURN
+.section .fixup,"ax"
+ENTRY(iret_exc)
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
+.previous
+ _ASM_EXTABLE(irq_return,iret_exc)
+
+#ifdef CONFIG_X86_ESPFIX32
+ldt_ss:
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, pv_info+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /* Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the iret */
+ DISABLE_INTERRUPTS(CLBR_EAX)
+ lss (%esp), %esp /* switch to espfix segment */
+ jmp restore_nocheck
+#endif
+ENDPROC(system_call)
+
+ # perform work that needs to be done immediately before resumption
+ ALIGN
+work_pending:
+ testb $_TIF_NEED_RESCHED, %cl
+ jz work_notifysig
+work_resched:
+ call schedule
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
+ # than syscall tracing?
+ jz restore_all
+ testb $_TIF_NEED_RESCHED, %cl
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and
+ # notify-resume requests
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+ movl %esp, %eax
+ jnz work_notifysig_v86 # returning to kernel-space or
+ # vm86-space
+1:
+#else
+ movl %esp, %eax
+#endif
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace
+
+#ifdef CONFIG_VM86
+ ALIGN
+work_notifysig_v86:
+ pushl %ecx # save ti_flags for do_notify_resume
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ movl %eax, %esp
+ jmp 1b
+#endif
+END(work_pending)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_trace_entry:
+ movl $-ENOSYS,PT_EAX(%esp)
+ movl %esp, %eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
+ cmpl $(NR_syscalls), %eax
+ jnae syscall_call
+ jmp syscall_exit
+END(syscall_trace_entry)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_exit_work:
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
+ jz work_pending
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
+ # schedule() instead
+ movl %esp, %eax
+ call syscall_trace_leave
+ jmp resume_userspace
+END(syscall_exit_work)
+
+syscall_fault:
+ ASM_CLAC
+ GET_THREAD_INFO(%ebp)
+ movl $-EFAULT,PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
+END(sysenter_badsys)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+ /* fixup the stack */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
+#endif
+.endm
+.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+#endif
+.endm
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+ .align 8
+ENTRY(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
+ .align 8
+ .endr
+END(irq_entries_start)
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+ ASM_CLAC
+ addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ movl %esp,%eax
+ call do_IRQ
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+
+#define BUILD_INTERRUPT3(name, nr, fn) \
+ENTRY(name) \
+ ASM_CLAC; \
+ pushl $~(nr); \
+ SAVE_ALL; \
+ TRACE_IRQS_OFF \
+ movl %esp,%eax; \
+ call fn; \
+ jmp ret_from_intr; \
+ENDPROC(name)
+
+
+#ifdef CONFIG_TRACING
+#define TRACE_BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+#else
+#define TRACE_BUILD_INTERRUPT(name, nr)
+#endif
+
+#define BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(name, nr, smp_##name); \
+ TRACE_BUILD_INTERRUPT(name, nr)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
+
+ENTRY(coprocessor_error)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_error
+ jmp error_code
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ ASM_CLAC
+ pushl $0
+#ifdef CONFIG_X86_INVD_BUG
+ /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+ ALTERNATIVE "pushl $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
+#else
+ pushl $do_simd_coprocessor_error
+#endif
+ jmp error_code
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ pushl $do_device_not_available
+ jmp error_code
+END(device_not_available)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+ iret
+ _ASM_EXTABLE(native_iret, iret_exc)
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+ENTRY(overflow)
+ ASM_CLAC
+ pushl $0
+ pushl $do_overflow
+ jmp error_code
+END(overflow)
+
+ENTRY(bounds)
+ ASM_CLAC
+ pushl $0
+ pushl $do_bounds
+ jmp error_code
+END(bounds)
+
+ENTRY(invalid_op)
+ ASM_CLAC
+ pushl $0
+ pushl $do_invalid_op
+ jmp error_code
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_segment_overrun
+ jmp error_code
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+ ASM_CLAC
+ pushl $do_invalid_TSS
+ jmp error_code
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ ASM_CLAC
+ pushl $do_segment_not_present
+ jmp error_code
+END(segment_not_present)
+
+ENTRY(stack_segment)
+ ASM_CLAC
+ pushl $do_stack_segment
+ jmp error_code
+END(stack_segment)
+
+ENTRY(alignment_check)
+ ASM_CLAC
+ pushl $do_alignment_check
+ jmp error_code
+END(alignment_check)
+
+ENTRY(divide_error)
+ ASM_CLAC
+ pushl $0 # no error code
+ pushl $do_divide_error
+ jmp error_code
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+ ASM_CLAC
+ pushl $0
+ pushl machine_check_vector
+ jmp error_code
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+ ASM_CLAC
+ pushl $0
+ pushl $do_spurious_interrupt_bug
+ jmp error_code
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_XEN
+/* Xen doesn't set %esp to be precisely what the normal sysenter
+ entrypoint expects, so fix it up before using the normal path. */
+ENTRY(xen_sysenter_target)
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+
+ENTRY(xen_hypervisor_callback)
+ pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /* Check to see if we got the event in the critical
+ region in xen_iret_direct, after we've reenabled
+ events and checked for pending events. This simulates
+ iret instruction's behaviour where it delivers a
+ pending interrupt when enabling interrupts. */
+ movl PT_EIP(%esp),%eax
+ cmpl $xen_iret_start_crit,%eax
+ jb 1f
+ cmpl $xen_iret_end_crit,%eax
+ jae 1f
+
+ jmp xen_iret_crit_fixup
+
+ENTRY(xen_do_upcall)
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
+ jmp ret_from_intr
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+ pushl %eax
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
+ testl %eax,%eax
+ popl %eax
+ lea 16(%esp),%esp
+ jz 5f
+ jmp iret_exc
+5: pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ jmp ret_from_exception
+
+.section .fixup,"ax"
+6: xorl %eax,%eax
+ movl %eax,4(%esp)
+ jmp 1b
+7: xorl %eax,%eax
+ movl %eax,8(%esp)
+ jmp 2b
+8: xorl %eax,%eax
+ movl %eax,12(%esp)
+ jmp 3b
+9: xorl %eax,%eax
+ movl %eax,16(%esp)
+ jmp 4b
+.previous
+ _ASM_EXTABLE(1b,6b)
+ _ASM_EXTABLE(2b,7b)
+ _ASM_EXTABLE(3b,8b)
+ _ASM_EXTABLE(4b,9b)
+ENDPROC(xen_failsafe_callback)
+
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ xen_evtchn_do_upcall)
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+ ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
+ movl 0x4(%ebp), %edx
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4,%esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+ftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+.globl ftrace_stub
+ftrace_stub:
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ pushf /* push flags before compare (in cs location) */
+
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * ip location, and move flags into the return ip location.
+ */
+ pushl 4(%esp) /* save return ip into ip slot */
+
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS,13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
+
+ popf
+ jmp ftrace_stub
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+trace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax
+ pushl %edx
+ movl %ebp, %eax
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ jmp *%ecx
+#endif
+
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ ASM_CLAC
+ pushl $trace_do_page_fault
+ jmp error_code
+END(trace_page_fault)
+#endif
+
+ENTRY(page_fault)
+ ASM_CLAC
+ pushl $do_page_fault
+ ALIGN
+error_code:
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ pushfl
+ pushl $__KERNEL_CS
+ pushl $sysenter_past_esp
+.endm
+
+ENTRY(debug)
+ ASM_CLAC
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je nmi_espfix_stack
+#endif
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ pushl %eax
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_all_notrace
+
+nmi_stack_fixup:
+ FIX_STACK 12, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK 24, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+#ifdef CONFIG_X86_ESPFIX32
+nmi_espfix_stack:
+ /*
+ * create the pointer to lss back
+ */
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ .endr
+ pushl %eax
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ jmp irq_return
+#endif
+END(nmi)
+
+ENTRY(int3)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+END(int3)
+
+ENTRY(general_protection)
+ pushl $do_general_protection
+ jmp error_code
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ ASM_CLAC
+ pushl $do_async_page_fault
+ jmp error_code
+END(async_page_fault)
+#endif
+
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
new file mode 100644
index 000000000000..4ad79e946f5a
--- /dev/null
+++ b/arch/x86/entry/entry_64.S
@@ -0,0 +1,1442 @@
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * A note on terminology:
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ *
+ * Some macro usage:
+ * - ENTRY/END Define functions in the symbol table.
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - idtentry - Define exception entry points.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/paravirt.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/context_tracking.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+
+ .code64
+ .section .entry.text, "ax"
+
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret64)
+ swapgs
+ sysretq
+ENDPROC(native_usergs_sysret64)
+#endif /* CONFIG_PARAVIRT */
+
+
+.macro TRACE_IRQS_IRETQ
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ *
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
+ * rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
+ * rdi arg0
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
+ * r8 arg4
+ * r9 arg5
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
+ *
+ * Only called from user space.
+ *
+ * When user can change pt_regs->foo always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ENTRY(system_call)
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+ SWAPGS_UNSAFE_STACK
+ /*
+ * A hypervisor implementation might want to use a label
+ * after the swapgs, so that it can do the swapgs
+ * for the guest and jump here on syscall.
+ */
+GLOBAL(system_call_after_swapgs)
+
+ movq %rsp,PER_CPU_VAR(rsp_scratch)
+ movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
+ /*
+ * Re-enable interrupts.
+ * We use 'rsp_scratch' as a scratch space, hence irq-off block above
+ * must execute atomically in the face of possible interrupt-driven
+ * task preemption. We must enable interrupts only after we're done
+ * with using rsp_scratch:
+ */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz tracesys
+system_call_fastpath:
+#if __SYSCALL_MASK == ~0
+ cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10,%rcx
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX(%rsp)
+1:
+/*
+ * Syscall return path ending with SYSRET (fast path).
+ * Has incompletely filled pt_regs.
+ */
+ LOCKDEP_SYS_EXIT
+ /*
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+
+ /*
+ * We must check ti flags with interrupts (or at least preemption)
+ * off because we must *never* return to userspace without
+ * processing exit work that is enqueued if we're preempted here.
+ * In particular, returning to userspace with any of the one-shot
+ * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
+ * very bad.
+ */
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
+
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RIP(%rsp),%rcx
+ movq EFLAGS(%rsp),%r11
+ movq RSP(%rsp),%rsp
+ /*
+ * 64bit SYSRET restores rip from rcx,
+ * rflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * Restoration of rflags re-enables interrupts.
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we should
+ * avoid SYSRET with SS == NULL, which could happen if we schedule,
+ * exit the kernel, and re-enter using an interrupt vector. (All
+ * interrupt entries on x86_64 set SS to NULL.) We prevent that
+ * from happening by reloading SS in __switch_to. (Actually
+ * detecting the failure in 64-bit userspace is tricky but can be
+ * done.)
+ */
+ USERGS_SYSRET64
+
+ /* Do syscall entry tracing */
+tracesys:
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ call syscall_trace_enter_phase1
+ test %rax, %rax
+ jnz tracesys_phase2 /* if needed, run the slow path */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
+ jmp system_call_fastpath /* and return to the fast path */
+
+tracesys_phase2:
+ SAVE_EXTRA_REGS
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ movq %rax,%rdx
+ call syscall_trace_enter_phase2
+
+ /*
+ * Reload registers from stack in case ptrace changed them.
+ * We don't reload %rax because syscall_trace_entry_phase2() returned
+ * the value it wants us to use in the table lookup.
+ */
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
+#if __SYSCALL_MASK == ~0
+ cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10,%rcx /* fixup for C */
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX(%rsp)
+1:
+ /* Use IRET because user could have changed pt_regs->foo */
+
+/*
+ * Syscall return path ending with IRET.
+ * Has correct iret frame.
+ */
+GLOBAL(int_ret_from_sys_call)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
+ TRACE_IRQS_OFF
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: mask to check */
+GLOBAL(int_with_check)
+ LOCKDEP_SYS_EXIT_IRQ
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz int_careful
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return
+
+ /* Either reschedule or signal or syscall exit tracking needed. */
+ /* First do a reschedule test. */
+ /* edx: work, edi: workmask */
+int_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ SCHEDULE_USER
+ popq %rdi
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+ /* handle signals and tracing -- both require a full pt_regs */
+int_very_careful:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_EXTRA_REGS
+ /* Check for syscall exit trace */
+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
+ jz int_signal
+ pushq %rdi
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
+ call syscall_trace_leave
+ popq %rdi
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
+ jmp int_restore_rest
+
+int_signal:
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call do_notify_resume
+1: movl $_TIF_WORK_MASK,%edi
+int_restore_rest:
+ RESTORE_EXTRA_REGS
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+syscall_return:
+ /* The IRETQ could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ movq RIP(%rsp),%r11
+ cmpq %rcx,%r11 /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * If width of "canonical tail" ever becomes variable, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- SYSRET checks need update"
+ .endif
+ /* Change top 16 bits to be the sign-extension of 47th bit */
+ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ /* If this changed %rcx, it was not canonical */
+ cmpq %rcx, %r11
+ jne opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_c_regs_and_iret
+END(system_call)
+
+
+ .macro FORK_LIKE func
+ENTRY(stub_\func)
+ SAVE_EXTRA_REGS 8
+ jmp sys_\func
+END(stub_\func)
+ .endm
+
+ FORK_LIKE clone
+ FORK_LIKE fork
+ FORK_LIKE vfork
+
+ENTRY(stub_execve)
+ call sys_execve
+return_from_execve:
+ testl %eax, %eax
+ jz 1f
+ /* exec failed, can use fast SYSRET code path in this case */
+ ret
+1:
+ /* must use IRET code path (pt_regs->cs may have changed) */
+ addq $8, %rsp
+ ZERO_EXTRA_REGS
+ movq %rax,RAX(%rsp)
+ jmp int_ret_from_sys_call
+END(stub_execve)
+/*
+ * Remaining execve stubs are only 7 bytes long.
+ * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+ */
+ .align 8
+GLOBAL(stub_execveat)
+ call sys_execveat
+ jmp return_from_execve
+END(stub_execveat)
+
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+ .align 8
+GLOBAL(stub_x32_execve)
+GLOBAL(stub32_execve)
+ call compat_sys_execve
+ jmp return_from_execve
+END(stub32_execve)
+END(stub_x32_execve)
+ .align 8
+GLOBAL(stub_x32_execveat)
+GLOBAL(stub32_execveat)
+ call compat_sys_execveat
+ jmp return_from_execve
+END(stub32_execveat)
+END(stub_x32_execveat)
+#endif
+
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */
+ENTRY(stub_rt_sigreturn)
+ /*
+ * SAVE_EXTRA_REGS result is not normally needed:
+ * sigreturn overwrites all pt_regs->GPREGS.
+ * But sigreturn can fail (!), and there is no easy way to detect that.
+ * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
+ * we SAVE_EXTRA_REGS here.
+ */
+ SAVE_EXTRA_REGS 8
+ call sys_rt_sigreturn
+return_from_stub:
+ addq $8, %rsp
+ RESTORE_EXTRA_REGS
+ movq %rax,RAX(%rsp)
+ jmp int_ret_from_sys_call
+END(stub_rt_sigreturn)
+
+#ifdef CONFIG_X86_X32_ABI
+ENTRY(stub_x32_rt_sigreturn)
+ SAVE_EXTRA_REGS 8
+ call sys32_x32_rt_sigreturn
+ jmp return_from_stub
+END(stub_x32_rt_sigreturn)
+#endif
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ pushq $0x0002
+ popfq # reset kernel eflags
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ RESTORE_EXTRA_REGS
+
+ testb $3, CS(%rsp) # from kernel_thread?
+
+ /*
+ * By the time we get here, we have no idea whether our pt_regs,
+ * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+ * the slow path, or one of the ia32entry paths.
+ * Use IRET code path to return, since it can safely handle
+ * all of the above.
+ */
+ jnz int_ret_from_sys_call
+
+ /* We came from kernel_thread */
+ /* nb: we depend on RESTORE_EXTRA_REGS above */
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_EXTRA_REGS
+ jmp int_ret_from_sys_call
+END(ret_from_fork)
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+ .align 8
+ENTRY(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
+ .align 8
+ .endr
+END(irq_entries_start)
+
+/*
+ * Interrupt entry/exit.
+ *
+ * Interrupt entry points save only callee clobbered registers in fast path.
+ *
+ * Entry runs with interrupts off.
+ */
+
+/* 0(%rsp): ~(interrupt number) */
+ .macro interrupt func
+ cld
+ /*
+ * Since nothing in interrupt handling code touches r12...r15 members
+ * of "struct pt_regs", and since interrupts can nest, we can save
+ * four stack slots and simultaneously provide
+ * an unwind-friendly stack layout by saving "truncated" pt_regs
+ * exactly up to rbp slot, without these members.
+ */
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ SAVE_C_REGS -RBP
+ /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+ SAVE_EXTRA_REGS_RBP -RBP
+
+ leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
+
+ testb $3, CS-RBP(%rsp)
+ jz 1f
+ SWAPGS
+1:
+ /*
+ * Save previous stack pointer, optionally switch to interrupt stack.
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ pushq %rsi
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+
+ call \func
+ .endm
+
+ /*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_interrupt.
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+ ASM_CLAC
+ addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
+ interrupt do_IRQ
+ /* 0(%rsp): old RSP */
+ret_from_intr:
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ decl PER_CPU_VAR(irq_count)
+
+ /* Restore saved previous stack */
+ popq %rsi
+ /* return code expects complete pt_regs - adjust rsp accordingly: */
+ leaq -RBP(%rsi),%rsp
+
+ testb $3, CS(%rsp)
+ jz retint_kernel
+ /* Interrupt came from user space */
+retint_user:
+ GET_THREAD_INFO(%rcx)
+ /*
+ * %rcx: thread info. Interrupts off.
+ */
+retint_with_reschedule:
+ movl $_TIF_WORK_MASK,%edi
+retint_check:
+ LOCKDEP_SYS_EXIT_IRQ
+ movl TI_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz retint_careful
+
+retint_swapgs: /* return to user-space */
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+ /* Interrupts are off */
+ /* Check if we need preemption */
+ bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ jnc 1f
+0: cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz 1f
+ call preempt_schedule_irq
+ jmp 0b
+1:
+#endif
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ TRACE_IRQS_IRETQ
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+
+irq_return:
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+.global native_irq_return_iret
+native_irq_return_iret:
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in do_double_fault.
+ * Other faults here are fatal.
+ */
+ iretq
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq %rax
+ pushq %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq %rax
+ jmp native_irq_return_iret
+#endif
+
+ /* edi: workmask, edx: work */
+retint_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc retint_signal
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ SCHEDULE_USER
+ popq %rdi
+ GET_THREAD_INFO(%rcx)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp retint_check
+
+retint_signal:
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz retint_swapgs
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_EXTRA_REGS
+ movq $-1,ORIG_RAX(%rsp)
+ xorl %esi,%esi # oldset
+ movq %rsp,%rdi # &pt_regs
+ call do_notify_resume
+ RESTORE_EXTRA_REGS
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ jmp retint_with_reschedule
+
+END(common_interrupt)
+
+/*
+ * APIC interrupts.
+ */
+.macro apicinterrupt3 num sym do_sym
+ENTRY(\sym)
+ ASM_CLAC
+ pushq $~(\num)
+.Lcommon_\sym:
+ interrupt \do_sym
+ jmp ret_from_intr
+END(\sym)
+.endm
+
+#ifdef CONFIG_TRACING
+#define trace(sym) trace_##sym
+#define smp_trace(sym) smp_trace_##sym
+
+.macro trace_apicinterrupt num sym
+apicinterrupt3 \num trace(\sym) smp_trace(\sym)
+.endm
+#else
+.macro trace_apicinterrupt num sym do_sym
+.endm
+#endif
+
+.macro apicinterrupt num sym do_sym
+apicinterrupt3 \num \sym \do_sym
+trace_apicinterrupt \num \sym
+.endm
+
+#ifdef CONFIG_SMP
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
+ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt3 REBOOT_VECTOR \
+ reboot_interrupt smp_reboot_interrupt
+#endif
+
+#ifdef CONFIG_X86_UV
+apicinterrupt3 UV_BAU_MESSAGE \
+ uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
+apicinterrupt LOCAL_TIMER_VECTOR \
+ apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR \
+ x86_platform_ipi smp_x86_platform_ipi
+
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt3 POSTED_INTR_VECTOR \
+ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+apicinterrupt THRESHOLD_APIC_VECTOR \
+ threshold_interrupt smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+apicinterrupt THERMAL_APIC_VECTOR \
+ thermal_interrupt smp_thermal_interrupt
+#endif
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
+#endif
+
+apicinterrupt ERROR_APIC_VECTOR \
+ error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+ spurious_interrupt smp_spurious_interrupt
+
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+ irq_work_interrupt smp_irq_work_interrupt
+#endif
+
+/*
+ * Exception entry points.
+ */
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+ENTRY(\sym)
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
+
+ ASM_CLAC
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+
+ .ifeq \has_error_code
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
+ ALLOC_PT_GPREGS_ON_STACK
+
+ .if \paranoid
+ .if \paranoid == 1
+ testb $3, CS(%rsp) /* If coming from userspace, switch */
+ jnz 1f /* stacks. */
+ .endif
+ call paranoid_entry
+ .else
+ call error_entry
+ .endif
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
+ TRACE_IRQS_OFF
+ .endif
+ .endif
+
+ movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ /* these procedures expect "no swapgs" flag in ebx */
+ .if \paranoid
+ jmp paranoid_exit
+ .else
+ jmp error_exit
+ .endif
+
+ .if \paranoid == 1
+ /*
+ * Paranoid entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+1:
+ call error_entry
+
+
+ movq %rsp,%rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax,%rsp /* switch stack */
+
+ movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ call \do_sym
+
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+END(\sym)
+.endm
+
+#ifdef CONFIG_TRACING
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#else
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#endif
+
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+
+
+ /* Reload gs selector with exception handling */
+ /* edi: new selector */
+ENTRY(native_load_gs_index)
+ pushfq
+ DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
+ SWAPGS
+gs_change:
+ movl %edi,%gs
+2: mfence /* workaround */
+ SWAPGS
+ popfq
+ ret
+END(native_load_gs_index)
+
+ _ASM_EXTABLE(gs_change,bad_gs)
+ .section .fixup,"ax"
+ /* running with kernelgs */
+bad_gs:
+ SWAPGS /* switch back to user gs */
+ xorl %eax,%eax
+ movl %eax,%gs
+ jmp 2b
+ .previous
+
+/* Call softirq on interrupt stack. Interrupts are off. */
+ENTRY(do_softirq_own_stack)
+ pushq %rbp
+ mov %rsp,%rbp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
+ push %rbp # backlink for old unwinder
+ call __do_softirq
+ leaveq
+ decl PER_CPU_VAR(irq_count)
+ ret
+END(do_softirq_own_stack)
+
+#ifdef CONFIG_XEN
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+
+/*
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ */
+ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
+/*
+ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ * see the correct pointer to the pt_regs
+ */
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+11: incl PER_CPU_VAR(irq_count)
+ movq %rsp,%rbp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ pushq %rbp # backlink for old unwinder
+ call xen_evtchn_do_upcall
+ popq %rsp
+ decl PER_CPU_VAR(irq_count)
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
+ jmp error_exit
+END(xen_do_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we in category 1.
+ */
+ENTRY(xen_failsafe_callback)
+ movl %ds,%ecx
+ cmpw %cx,0x10(%rsp)
+ jne 1f
+ movl %es,%ecx
+ cmpw %cx,0x18(%rsp)
+ jne 1f
+ movl %fs,%ecx
+ cmpw %cx,0x20(%rsp)
+ jne 1f
+ movl %gs,%ecx
+ cmpw %cx,0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp),%rcx
+ movq 8(%rsp),%r11
+ addq $0x30,%rsp
+ pushq $0 /* RIP */
+ pushq %r11
+ pushq %rcx
+ jmp general_protection
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp),%rcx
+ movq 8(%rsp),%r11
+ addq $0x30,%rsp
+ pushq $-1 /* orig_ax = -1 => not a system call */
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+ jmp error_exit
+END(xen_failsafe_callback)
+
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ hyperv_callback_vector hyperv_vector_handler
+#endif /* CONFIG_HYPERV */
+
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1
+#ifdef CONFIG_XEN
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
+#endif
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
+#ifdef CONFIG_KVM_GUEST
+idtentry async_page_fault do_async_page_fault has_error_code=1
+#endif
+#ifdef CONFIG_X86_MCE
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
+#endif
+
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+END(paranoid_entry)
+
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+ENTRY(paranoid_exit)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF_DEBUG
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
+ SWAPGS_UNSAFE_STACK
+ jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
+END(paranoid_exit)
+
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(error_entry)
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ xorl %ebx,%ebx
+ testb $3, CS+8(%rsp)
+ jz error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+ TRACE_IRQS_OFF
+ ret
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq native_irq_return_iret(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_bad_iret
+ movl %ecx,%eax /* zero extend */
+ cmpq %rax,RIP+8(%rsp)
+ je bstep_iret
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+
+bstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx,RIP+8(%rsp)
+ /* fall through */
+
+error_bad_iret:
+ SWAPGS
+ mov %rsp,%rdi
+ call fixup_bad_iret
+ mov %rax,%rsp
+ decl %ebx /* Return to usergs */
+ jmp error_sti
+END(error_entry)
+
+
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+ENTRY(error_exit)
+ movl %ebx,%eax
+ RESTORE_EXTRA_REGS
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %eax,%eax
+ jnz retint_kernel
+ jmp retint_user
+END(error_exit)
+
+/* Runs on exception stack */
+ENTRY(nmi)
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to the repeate_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as our temp variable throughout */
+ pushq %rdx
+
+ /*
+ * If %cs was not the kernel segment, then the NMI triggered in user
+ * space, which means it is definitely not nested.
+ */
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ leaq -1*8(%rsp), %rdx
+ movq %rdx, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+
+nested_nmi_out:
+ popq %rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to the repeat_nmi. The original stack frame and the temp storage
+ * is also used by nested NMIs and can not be trusted on exit.
+ */
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+
+ /* Set the NMI executing variable on the stack. */
+ pushq $1
+
+ /*
+ * Leave room for the "copied" frame
+ */
+ subq $(5*8), %rsp
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq 11*8(%rsp)
+ .endr
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 10*8(%rsp)
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ addq $(10*8), %rsp
+ .rept 5
+ pushq -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+end_repeat_nmi:
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
+ */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ ALLOC_PT_GPREGS_ON_STACK
+
+ /*
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
+ call paranoid_entry
+
+ /*
+ * Save off the CR2 register. If we take a page fault in the NMI then
+ * it could corrupt the CR2 value. If the NMI preempts a page fault
+ * handler before it was able to read the CR2 register, and then the
+ * NMI itself takes a page fault, the page fault that was preempted
+ * will read the information from the NMI page fault and not the
+ * origin fault. Save it off and restore it if it changes.
+ * Use the r12 callee-saved register.
+ */
+ movq %cr2, %r12
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+
+ /* Did the NMI take a page fault? Restore cr2 if it did */
+ movq %cr2, %rcx
+ cmpq %rcx, %r12
+ je 1f
+ movq %r12, %cr2
+1:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ /* Pop the extra iret frame at once */
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
+
+ /* Clear the NMI executing stack variable */
+ movq $0, 5*8(%rsp)
+ jmp irq_return
+END(nmi)
+
+ENTRY(ignore_sysret)
+ mov $-ENOSYS,%eax
+ sysret
+END(ignore_sysret)
+