diff options
Diffstat (limited to 'arch/x86')
183 files changed, 5850 insertions, 6351 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 3e6f64073005..0038a2d10a7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/ -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ - obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87e447286c37..4b278a33ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,8 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. + select ARCH_HAS_REFCOUNT if BROKEN select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN @@ -73,7 +75,6 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -158,6 +159,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -169,7 +171,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -427,16 +429,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && 
CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -780,8 +782,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT @@ -1806,7 +1806,9 @@ config X86_SMAP config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n - depends on CPU_SUP_INTEL + # Note: only available in 64-bit mode due to VMA flags shortage + depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS ---help--- MPX provides hardware features that can be used in conjunction with compiler-instrumented code to check diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cd20ca0b4043..71a48a30fc84 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. 
+ +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. 
+ +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1e902f926be3..6276572259c8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,9 +14,11 @@ endif # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) - cc_stack_align_opt := -mpreferred-stack-boundary -else ifneq ($(call cc-option, -mstack-alignment=4),) - cc_stack_align_opt := -mstack-alignment + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 endif # How to compile the 16-bit code. Note we always compile for -march=i386; @@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y) # Align the stack to the register width instead of using the default # alignment of 16 bytes. This reduces stack usage and the number of # alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -115,7 +117,7 @@ else # default alignment which keep the stack *mis*aligned. 
# Furthermore an alignment to the register width reduces stack usage # and the number of alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) @@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869eaef0c..926c2cc4facc 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; @@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = 0; desc->dpl = 0; desc->p = 1; - desc->limit = 0x0; + desc->limit1 = 0x0; desc->avl = 0; desc->l = 0; desc->d = 0; diff --git 
a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d85b9625e836..11c68cf53d4e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -61,71 +61,6 @@ __HEAD ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call 1f -1: popl %esi - subl $1b, %esi - - popl %ecx - movl %ecx, efi32_config(%esi) /* Handle */ - popl %ecx - movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax - - call make_boot_params - cmpl $0, %eax - je fail - movl %esi, BP_code32_start(%eax) - popl %ecx - pushl %eax - pushl %ecx - jmp 2f /* Skip efi_config initialization */ - -ENTRY(efi32_stub_entry) - add $0x4, %esp - popl %ecx - popl %edx - - call 1f -1: popl %esi - subl $1b, %esi - - movl %ecx, efi32_config(%esi) /* Handle */ - movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax -2: - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax - -preferred_addr: -#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -208,6 +143,70 @@ preferred_addr: jmp *%eax ENDPROC(startup_32) +#ifdef CONFIG_EFI_STUB +/* + * We don't need the return address, so set up the stack so efi_main() can find + * its arguments. 
+ */ +ENTRY(efi_pe_entry) + add $0x4, %esp + + call 1f +1: popl %esi + subl $1b, %esi + + popl %ecx + movl %ecx, efi32_config(%esi) /* Handle */ + popl %ecx + movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax + + call make_boot_params + cmpl $0, %eax + je fail + movl %esi, BP_code32_start(%eax) + popl %ecx + pushl %eax + pushl %ecx + jmp 2f /* Skip efi_config initialization */ +ENDPROC(efi_pe_entry) + +ENTRY(efi32_stub_entry) + add $0x4, %esp + popl %ecx + popl %edx + + call 1f +1: popl %esi + subl $1b, %esi + + movl %ecx, efi32_config(%esi) /* Handle */ + movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax +2: + call efi_main + cmpl $0, %eax + movl %eax, %esi + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leal startup_32(%eax), %eax + jmp *%eax +ENDPROC(efi32_stub_entry) +#endif + .text relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fbf4c32d0b62..b4a5d284391c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -243,65 +243,6 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - movq %rcx, efi64_config(%rip) /* Handle */ - movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ - - leaq efi64_config(%rip), %rax - movq %rax, efi_config(%rip) - - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). 
- */ - addq %rbp, efi64_config+40(%rip) - - movq %rax, %rdi - call make_boot_params - cmpq $0,%rax - je fail - mov %rax, %rsi - leaq startup_32(%rip), %rax - movl %eax, BP_code32_start(%rsi) - jmp 2f /* Skip the relocation */ - -handover_entry: - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - movq efi_config(%rip), %rax - addq %rbp, 40(%rax) -2: - movq efi_config(%rip), %rdi - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax - -preferred_addr: -#endif /* Setup data segments. */ xorl %eax, %eax @@ -413,6 +354,59 @@ lvl5: jmp *%rax #ifdef CONFIG_EFI_STUB + +/* The entry point for the PE/COFF executable is efi_pe_entry. */ +ENTRY(efi_pe_entry) + movq %rcx, efi64_config(%rip) /* Handle */ + movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ + + leaq efi64_config(%rip), %rax + movq %rax, efi_config(%rip) + + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + addq %rbp, efi64_config+40(%rip) + + movq %rax, %rdi + call make_boot_params + cmpq $0,%rax + je fail + mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) + jmp 2f /* Skip the relocation */ + +handover_entry: + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + movq efi_config(%rip), %rax + addq %rbp, 40(%rax) +2: + movq efi_config(%rip), %rdi + call efi_main + movq %rax,%rsi + cmpq $0,%rax + jne 2f +fail: + /* EFI init failed, so hang. 
*/ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leaq startup_64(%rax), %rax + jmp *%rax +ENDPROC(efi_pe_entry) + .org 0x390 ENTRY(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 99c7194f7ea6..17818ba6906f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -37,7 +37,9 @@ #include <linux/uts.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/efi.h> #include <generated/utsrelease.h> +#include <asm/efi.h> /* Macros used by the included decompressor code below. */ #define STATIC @@ -558,6 +560,87 @@ static void process_mem_region(struct mem_vector *entry, } } +#ifdef CONFIG_EFI +/* + * Returns true if mirror region found (and must have been processed + * for slots adding) + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + struct efi_info *e = &boot_params->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; + int i; + + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. 
EFI should be disabled.\n"); + return false; + } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif + + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + /* + * Here we are more conservative in picking free memory than + * the EFI spec allows: + * + * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also + * free memory and thus available to place the kernel image into, + * but in practice there's firmware where using that memory leads + * to crashes. + * + * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. + */ + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + process_mem_region(&region, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + break; + } + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + static void process_e820_entries(unsigned long minimum, unsigned long image_size) { @@ -586,13 +669,16 @@ static unsigned long find_random_phys_addr(unsigned long minimum, { /* Check if we had too many memmaps. */ if (memmap_too_large) { - debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0; } /* Make sure minimum is aligned. 
*/ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + if (process_efi_entries(minimum, image_size)) + return slots_fetch_random(); + process_e820_entries(minimum, image_size); return slots_fetch_random(); } @@ -652,7 +738,7 @@ void choose_random_location(unsigned long input, */ min_addr = min(*output, 512UL << 20); - /* Walk e820 and find a random address. */ + /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("Physical KASLR disabled: no suitable memory region!"); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a0838ab929f2..c14217cd0155 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -116,8 +116,7 @@ void __putstr(const char *s) } } - if (boot_params->screen_info.orig_video_mode == 0 && - lines == 0 && cols == 0) + if (lines == 0 || cols == 0) return; x = boot_params->screen_info.orig_x; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2ed8f0c25def..1bb08ecffd24 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the description in lib/decompressor_xxx.c for specific information. # # extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# LZ4 is even worse: data that cannot be further compressed grows by 0.4%, +# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as +# the size-dependent part now grows so fast. 
+# +# extra_bytes = (uncompressed_size >> 8) + 65536 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4a55cdcdc008..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; 
__cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..af28a8a24366 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..640aafebdc00 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,5 @@ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), 
%r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/livepatch.h> +#include <linux/syscalls.h> #include <asm/desc.h> #include <asm/traps.h> @@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb32c42..8a13d468635a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -673,16 +673,8 @@ ENTRY(name) \ jmp ret_from_intr; \ ENDPROC(name) - -#ifdef CONFIG_TRACING -# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -# define TRACE_BUILD_INTERRUPT(name, nr) -#endif - #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. 
interrupts come from */ #include <asm/entry_arch.h> @@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) + xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) + hyperv_vector_handler) #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6d078b89a5e8..49167258d587 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -43,9 +44,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. 
- */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -316,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -351,6 +353,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +370,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -406,6 +410,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,6 +418,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -440,13 
+446,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. 
+ */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -485,17 +580,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +600,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -561,6 +644,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. 
@@ -633,6 +717,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +739,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -662,31 +748,13 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym POP_SECTION_IRQENTRY .endm @@ -740,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -763,6 +832,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -829,17 +899,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) 
trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -860,6 +919,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +928,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,17 +953,15 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. 
@@ -923,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1010,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -968,13 +1028,13 @@ ENTRY(xen_failsafe_callback) movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -998,13 +1058,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1020,6 +1079,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1047,6 +1107,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ @@ -1068,6 +1129,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1152,6 +1214,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1160,19 +1223,9 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ +/* XXX: broken on Xen PV */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. 
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1234,11 +1287,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,6 +1310,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1409,6 +1465,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1424,6 +1481,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,6 +1531,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1510,17 +1569,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. 
*/ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..e26c25ca7756 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. */ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat) jmp restore_regs_and_iret END(entry_INT80_compat) - ALIGN -GLOBAL(stub32_clone) +ENTRY(stub32_clone) /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). 
@@ -353,3 +350,4 @@ GLOBAL(stub32_clone) */ xchg %r8, %rcx jmp sys_clone +ENDPROC(stub32_clone) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 726355ce8497..1911310959f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ad44af0dd667..f5cbbba99283 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; + unsigned int nshared, subleaf, prev_eax = 0; uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; + /* + * Iterate over Cache Topology Definition leaves until no + * more cache descriptions are available. 
+ */ + for (subleaf = 0; subleaf < 5; subleaf++) { + cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); + + /* EAX[0:4] gives type of cache */ + if (!(eax & 0x1f)) + break; + + prev_eax = eax; + } + nshared = ((prev_eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); @@ -555,7 +568,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index af12e294caed..80534d3c2480 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_max_precise(void) { - if (event->attr.precise_ip) { - int precise = 0; + int precise = 0; - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; + if (x86_pmu.pebs_prec_dist) + precise++; + } + return precise; +} - if (x86_pmu.pebs_prec_dist) - precise++; - } +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) } static struct attribute_group x86_pmu_attr_group; +static struct attribute_group x86_pmu_caps_group; static int __init init_hw_perf_events(void) { @@ -1799,6 +1806,14 @@ static int __init 
init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.caps_attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); + if (!WARN_ON(!tmp)) + x86_pmu_caps_group.attrs = tmp; + } + if (x86_pmu.event_attrs) x86_pmu_events_group.attrs = x86_pmu.event_attrs; @@ -2213,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = { .attrs = x86_pmu_attrs, }; +static ssize_t max_precise_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); +} + +static DEVICE_ATTR_RO(max_precise); + +static struct attribute *x86_pmu_caps_attrs[] = { + &dev_attr_max_precise.attr, + NULL +}; + +static struct attribute_group x86_pmu_caps_group = { + .name = "caps", + .attrs = x86_pmu_caps_attrs, +}; + static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, + &x86_pmu_caps_group, NULL, }; @@ -2335,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - if (idx > LDT_ENTRIES) - return 0; - /* IRQs are off, so this synchronizes with smp_store_release */ ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->nr_entries) + if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; @@ -2348,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment) return 0; #endif } else { - if (idx > GDT_ENTRIES) + if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o 
obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index ddd8d3516bfc..16076eb34699 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 98b0f0729527..829e89cfcee2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + NULL, +}; + +static struct attribute *hsw_format_attr[] = { &format_attr_in_tx.attr, &format_attr_in_tx_cp.attr, + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, +static struct attribute *nhm_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; + +static struct attribute *slm_format_attr[] = { + &format_attr_offcore_rsp.attr, + NULL }; static struct attribute *skl_format_attr[] = { @@ -3781,6 +3795,36 @@ done: static DEVICE_ATTR_RW(freeze_on_smi); +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *lbr_attrs[] = { + &dev_attr_branches.attr, + NULL +}; + +static char pmu_name_str[30]; + +static ssize_t pmu_name_show(struct device *cdev, + struct device_attribute 
*attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str); +} + +static DEVICE_ATTR_RO(pmu_name); + +static struct attribute *intel_pmu_caps_attrs[] = { + &dev_attr_pmu_name.attr, + NULL +}; + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, NULL, @@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; + struct attribute **extra_attr = NULL; + char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); + name = "core"; break; case INTEL_FAM6_CORE2_MEROM: @@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core2_event_constraints; x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); + name = "core2"; break; case INTEL_FAM6_NEHALEM: @@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; + extra_attr = nhm_format_attr; pr_cont("Nehalem events, "); + name = "nehalem"; break; case INTEL_FAM6_ATOM_PINEVIEW: @@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); + name = "bonnell"; break; case INTEL_FAM6_ATOM_SILVERMONT1: @@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = slm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Silvermont events, "); + name = "silvermont"; break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = glm_events_attrs; + extra_attr = 
slm_format_attr; pr_cont("Goldmont events, "); + name = "goldmont"; break; case INTEL_FAM6_ATOM_GEMINI_LAKE: @@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void) x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; + extra_attr = slm_format_attr; pr_cont("Goldmont plus events, "); + name = "goldmont_plus"; break; case INTEL_FAM6_WESTMERE: @@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void) X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); intel_pmu_pebs_data_source_nhm(); + extra_attr = nhm_format_attr; pr_cont("Westmere events, "); + name = "westmere"; break; case INTEL_FAM6_SANDYBRIDGE: @@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("SandyBridge events, "); + name = "sandybridge"; break; case INTEL_FAM6_IVYBRIDGE: @@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("IvyBridge events, "); + name = "ivybridge"; break; @@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Haswell events, "); + name = "haswell"; break; case INTEL_FAM6_BROADWELL_CORE: @@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ hsw_format_attr : nhm_format_attr; pr_cont("Broadwell events, "); + name = "broadwell"; break; case INTEL_FAM6_XEON_PHI_KNL: @@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void) /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - + extra_attr = slm_format_attr; pr_cont("Knights Landing/Mill events, "); + name = "knights-landing"; break; case INTEL_FAM6_SKYLAKE_MOBILE: @@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); + name = "skylake"; break; default: @@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void) case 1: x86_pmu.event_constraints = intel_v1_event_constraints; pr_cont("generic architected perfmon v1, "); + name = "generic_arch_v1"; break; default: /* @@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void) */ x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("generic architected perfmon, "); + name = "generic_arch_v2+"; break; } } + snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + + if (version >= 2 && extra_attr) { + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + extra_attr); + WARN_ON(!x86_pmu.format_attrs); + } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + x86_pmu.caps_attrs = intel_pmu_caps_attrs; + + if (x86_pmu.lbr_nr) { + 
x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + } + /* * Access extra MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support offcore event diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include <asm/intel_rdt_common.h> -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. 
- */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. 
no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. 
The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. 
- */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. 
- */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. 
- */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. 
- */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. 
- */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. 
- * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. 
- */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. 
To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. 
This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. 
- * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. 
- */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). 
- */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. 
- */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). 
- */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. 
- */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... 
- */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. 
- */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - 
EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum 
takes effect immediately. - */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? 
*/ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. 
Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. 
- */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a322fed5f8ed..e1965e5ff570 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -49,34 +49,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | 
P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? 
LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status) /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 955457a30197..8a6bbacd17dc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -109,6 +109,9 @@ enum { X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].in_tx = 0; cpuc->lbr_entries[i].abort = 0; cpuc->lbr_entries[i].cycles = 0; + cpuc->lbr_entries[i].type = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events 
*cpuc) cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].type = 0; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_CALL) mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +static int +common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} + /* * implement actual branch filter based on user demand. 
* Hardware may not exactly satisfy that request, thus @@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) bool compress = false; /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) return; for (i = 0; i < cpuc->lbr_stack.nr; i++) { @@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].from = 0; compress = true; } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); } if (!compress) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter. */ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 476aec3a4cab..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. 
@@ -558,6 +558,7 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; + struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; @@ -591,7 +592,8 @@ struct x86_pmu { pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -741,6 +743,8 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); +int x86_pmu_max_precise(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); @@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..e0bb46c02857 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. 
*/ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm 
volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return 
old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return 
try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm 
volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 66ac08607471..42bbbf0f173d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,7 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..1a2ba368da39 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p 
= info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned 
type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, 
ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define _trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. 
- */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. 
That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..346d252029b7 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -11,34 +11,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. 
We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +43,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. 
*/ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. */ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index a3de31ffb722..04330c8d9af9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = 
regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 07b06955a05d..aa15d1f7e530 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,20 +13,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef 
CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, - smp_kvm_posted_intr_nested_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 255645f60ca2..554cdb205d17 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, -1); + copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); @@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate); + __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ 
b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d6dbafbd4207..6dfe366a8804 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void 
trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. 
- * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. 
- */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. 
- * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain 
*d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. 
- */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. 
- */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. 
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 4bc6f459a8b6..c40a95c33bb8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) 
__writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. 
*/ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. 
- */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int 
handle); #define arch_phys_wc_index arch_phys_wc_index diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca540025..9958ceea2fa3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..ddbb8ea0f5a9 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7cbaab523f22..369e41c23f07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -492,6 +492,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 pkru; u32 hflags; u64 efer; u64 apic_base; @@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h 
deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. 
*/ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). 
Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index d25d9f4abb15..7ae318c340d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -148,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 0d4b01c5e438..63cc96f064dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -30,6 +30,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General 
Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. 
+ */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. 
+ * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. 
+ */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9ccac1926587..c25dd22f7c70 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -960,11 +960,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9ffc36bfe4cd..6b64fc6367f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -196,9 +196,6 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c61bab07a84e..3fa26a61eabc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -667,7 +668,7 @@ static inline void sync_core(void) * In case NMI unmasking or 
performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -690,6 +691,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -699,6 +701,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..b408b1886195 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. 
+ */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. */ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. 
The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an 
undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) 
\ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..066aaf813141 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,9 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. 
Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). 
Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d23e61dc0640..4893abf7f74f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -198,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +extern void initialize_tlbstate_and_flush(void); + static inline void __native_flush_tlb(void) { /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! 
- */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..5665bf205b8d 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..1599d394c8c1 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ 
b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? 
-EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. + */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..5545f6459bf5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +35,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void 
xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 30269dafec47..184eb9894dae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if 
defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? 
&state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. 
+ */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. 
+ */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fcd630e..9606688caa4b 100644 --- 
a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. 
Both domU and dom0 type of guests are diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..3be08f07695c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,9 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * Take the 4 protection key bits out of the vma->vm_flags diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..fd0a7895b63f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 97bb2caf3428..f8ae286c1502 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -118,7 +118,7 @@ static u32 
isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) @@ -127,7 +127,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; @@ -199,8 +199,10 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC int apic_id; u8 enabled; +#endif processor = (struct acpi_madt_local_x2apic *)header; @@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); +#ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; -#ifdef CONFIG_X86_X2APIC + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1083,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); /* * Use the default configuration for the IRQs 0-15. 
Unless diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. 
+ */ bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98b3dd8cf2bf..7834f73efbf1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -177,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ @@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = { static void apic_check_deadline_errata(void) { - const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + const struct x86_cpu_id *m; u32 rev; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + m = x86_match_cpu(deadline_match); if (!m) return; @@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. 
- * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct 
pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 237e9c2341c7..70e48aa6af98 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b3af457ed667..88c214e75a6b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 
880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 99332f550c48..cf42206926af 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,7 +20,6 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 
cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 110ca5d2bb87..9862e2cd6d93 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -297,13 +297,29 @@ static int nearby_node(int apicid) } #endif +#ifdef CONFIG_SMP +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + /* * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u8 node_id; @@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } else return; - /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cus_per_node; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cus_per_node = c->x86_max_cores / nodes_per_socket; - - /* core id has to be in the [0 .. 
cores_per_node - 1] range */ - c->cpu_core_id %= cus_per_node; + legacy_fixup_core_id(c); } } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b95cd94ca97b..fb1d3358a4af 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { + /* + * We'd like to use cr4_set_bits_and_update_boot(), + * but we can't. CR4.PCIDE is special and can only + * be set in long mode, and the early CPU init code + * doesn't know this and would try to restore CR4.PCIDE + * prior to entering long mode. + * + * Instead, we rely on the fact that hotplug, resume, + * etc all fully restore CR4 before they write anything + * that could have nonzero PCID bits to CR3. CR4.PCIDE + * has no effect on the page tables themselves, so we + * don't need it to be restored early. + */ cr4_set_bits(X86_CR4_PCIDE); } else { /* @@ -1329,15 +1342,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1592,6 +1596,7 @@ void cpu_init(void) mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); load_sp0(t, &current->thread); @@ -1646,6 +1651,7 @@ void cpu_init(void) mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c55fb2cb2acc..24f749324c0f 100644 ---
a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; @@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, &this_leaf->shared_cpu_map); } } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } } else return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. 
Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + 
[RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return 
true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. 
*/ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. 
*/ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. 
+ */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. 
+ */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. 
*/ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + 
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + 
if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 
0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. 
+ * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. 
+ * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index 
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines 
scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define 
for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct 
rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + 
for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. 
+ */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. 
+ */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. 
+ */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if 
(!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. 
+ */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory 
under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. 
A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. 
*/ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct 
rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if 
(rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - 
.seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size 
* 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. */ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. 
*/ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = 
ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = 
r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if 
(rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int 
reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. 
*/ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. 
*/ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. 
+ */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. 
+ */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain which is named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. 
Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. 
+ */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. 
+ */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. 
*/ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. 
+ */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. 
+ */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c 
b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9e314bcf67cc..40e28ed77fbf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { @@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; @@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -static inline void __smp_deferred_error_interrupt(void) -{ - inc_irq_stat(irq_deferred_error_count); - deferred_error_int_vector(); -} - asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); - __smp_deferred_error_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) -{ - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index f7370abd33c6..2da67b70ba98 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void) static void 
(*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -static inline void __smp_thermal_interrupt(void) -{ - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); -} - -asmlinkage __visible void __irq_entry -smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_thermal_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry -smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index bb0e75eed10a..5e7249e42f8f 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,24 +17,12 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -static inline void __smp_threshold_interrupt(void) -{ - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); -} - asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); - __smp_threshold_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) -{ - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 21b185793c80..c6daec4bdba5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch) list_for_each_entry(p, &microcode_cache, plist) { if 
(p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) + if (p->patch_id >= new_patch->patch_id) { /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); return; + } list_replace(&p->plist, &new_patch->plist); kfree(p->data); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 59edbe9d4ccb..8f7a9bbad514 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header, return false; } -static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) +static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) - return ERR_PTR(-ENOMEM); + return NULL; p->data = kmemdup(data, size, GFP_KERNEL); if (!p->data) { kfree(p); - return ERR_PTR(-ENOMEM); + return NULL; } return p; @@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size) if (mc_hdr->rev <= mc_saved_hdr->rev) continue; - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer %p\n", data); else list_replace(&iter->plist, &p->plist); @@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size) * newly found. */ if (!prev_found) { - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer for %p\n", data); else list_add_tail(&p->plist, &microcode_cache); } + if (!p) + return; + /* * Save for early loading. On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before * paging has been enabled. 
*/ - if (p) { - if (IS_ENABLED(CONFIG_X86_32)) - intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); - else - intel_ucode_patch = p->data; - } + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; } static int microcode_sanity_check(void *mc, int print_err) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index daefd67a66c7..3b3f713e15e5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* - * Setup the IDT for hypervisor callback. Prevent reallocation - * at module reload. - */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); } void hv_remove_vmbus_irq(void) @@ -184,9 +179,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. 
*/ @@ -219,7 +220,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -254,7 +255,7 @@ static void __init ms_hyperv_init_platform(void) } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. */ - if (regs && stack == &regs->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == &regs->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. 
No need to * check for an error: if anything goes wrong, the rest @@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs) + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) __show_regs(regs, 0); } @@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_X86_32 if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; } else { sp = kernel_stack_pointer(regs); savesegment(ss, ss); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e5f0b40e66d2..4f0481474903 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. 
*/ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 3e1471d57487..225af4184f06 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stack <= begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index d907c3d8633f..927abeaf63e2 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,10 +12,10 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/delay.h> -#include <linux/dmi.h> #include <linux/pci_ids.h> #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> +#include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), }; static void __init @@ -593,7 +594,7 @@ static void __init apple_airport_reset(int bus, int slot, int func) u64 addr; int i; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; /* Card may have been put into PCI_D3hot by grub quirk */ diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c 
new file mode 100644 index 000000000000..f260e452e4f8 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,19 @@ +/* + * EISA specific code + * + * This file is licensed under the GPL V2 + */ +#include <linux/ioport.h> +#include <linux/eisa.h> +#include <linux/io.h> + +static __init int eisa_bus_probe(void) +{ + void __iomem *p = ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 538ec012b371..cf2ce063f65a 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820/api.h> @@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { cr4_init_shadow(); + + idt_setup_early_handler(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a193b93fd95..bab4fa579450 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -311,8 +311,6 @@ static void __init copy_bootdata(char *real_mode_data) asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { - int i; - /* * Build-time sanity checks on the kernel image and module * area mappings. 
(these are purely build-time and produce no code) @@ -345,9 +343,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handler_array[i]); - load_idt((const struct desc_ptr *)&idt_descr); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1f85ee8f9439..9ed3074d0d27 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 @@ -347,7 +345,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -380,37 +377,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* - * Set up a idt with 256 interrupt gates that push zero if there - * is no error code and then jump to early_idt_handler_common. - * It doesn't actually load the idt - that needs to be done on - * each CPU. Interrupts are enabled elsewhere, when we can be - * relatively sure everything is ok. 
- */ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. The linker can't handle this by @@ -457,12 +423,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +442,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,18 +455,17 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl 
%fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -539,7 +500,8 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) + __INITDATA .align 4 GLOBAL(early_recursion_flag) @@ -628,7 +590,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -637,11 +598,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000000..6107ee1cb8d5 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,371 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include <linux/interrupt.h> + +#include <asm/traps.h> +#include <asm/proto.h> +#include <asm/desc.h> + +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + 
+/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate with interrupt stack */ +#define SISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initdata struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_XF, simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, double_fault), +#endif + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BP, int3), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, &machine_check), +#endif + + SYSG(X86_TRAP_OF, overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + 
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initdata struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), + INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), + INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), + INTG(REBOOT_VECTOR, reboot_interrupt), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, thermal_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, irq_work_interrupt), +# endif + INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(ERROR_APIC_VECTOR, error_interrupt), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. 
*/ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .address = (unsigned long) idt_table, +}; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi, NMI_STACK), + SISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), +#endif +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; +#endif + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, used_vectors); + } +} + +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = n; + data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + 
data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? 
+ */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); +} +#endif + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init update_intr_gate(unsigned int n, const void 
*addr) +{ + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); +} + +void alloc_intr_gate(unsigned int n, const void *addr) +{ + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4ed0aba8dbc8..52089c043160 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; -/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -259,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. 
*/ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} @@ -334,19 +331,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) } #endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); -} - -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU @@ -431,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. 
*/ for (vector = FIRST_EXTERNAL_VECTOR; - vector < first_system_vector; vector++) { + vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 275487872be2..70dee056f92b 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -11,35 +11,23 @@ #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> -static inline void __smp_irq_work_interrupt(void) -{ - inc_irq_stat(apic_irq_work_irqs); - irq_work_run(); -} - +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_irq_work_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c7fd18526c3e..1add9e08e83e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... 
NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; @@ -99,100 +87,12 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); - - /* IPI used for rebooting/stopping */ - alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_THERMAL_VECTOR - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#ifdef CONFIG_X86_MCE_THRESHOLD - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#ifdef CONFIG_X86_MCE_AMD - alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for X86 platform specific use */ - alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); -#ifdef CONFIG_HAVE_KVM - /* IPI for KVM to deliver posted interrupt */ - alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); - /* IPI for KVM to deliver interrupt to wake up tasks */ - alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, 
kvm_posted_intr_wakeup_ipi); - /* IPI for KVM to deliver nested posted interrupt */ - alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); -#endif - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* IRQ work interrupts: */ -# ifdef CONFIG_IRQ_WORK - alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); -# endif - -#endif -} - void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - apic_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif + idt_setup_apic_and_irq_gates(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 69ea0bc1cfa3..4f98aad38237 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -39,6 +39,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/sections.h> #include "common.h" @@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr) /* * Do not optimize in the entry code due to the unstable - * stack handling. + * stack handling and registers setup. 
*/ - if ((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) || + ((paddr >= (unsigned long)__irqentry_text_start) && + (paddr < (unsigned long)__irqentry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d04e30e3c0ff..874827b0d7ca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ @@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a870910c8565..f0e64db18ac8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -21,6 +21,25 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *__mm) { @@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) pc = &mm->context; set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. 
LDT starts zeroed. */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d7a1bc..00bc751c861c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) 
\ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 446c8aa09b9b..35aafc95e4b8 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -39,26 +39,26 @@ #include <trace/events/nmi.h> struct nmi_desc { - spinlock_t lock; + raw_spinlock_t lock; struct list_head head; }; static struct nmi_desc nmi_desc[NMI_MAX] = { { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), .head = LIST_HEAD_INIT(nmi_desc[0].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), .head = LIST_HEAD_INIT(nmi_desc[2].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), .head = LIST_HEAD_INIT(nmi_desc[3].head), }, @@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) init_irq_work(&action->irq_work, nmi_max_handler); - 
spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * Indicate if there are multiple registrations on the @@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) else list_add_tail_rcu(&action->list, &desc->head); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(__register_nmi_handler); @@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) struct nmiaction *n; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); list_for_each_entry_rcu(n, &desc->head, list) { /* @@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bc0a849589bb..a14df9eecfed 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..11966251cd42 100644 --- 
a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. 
On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. 
+ */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. 
xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. 
- * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..eaa591cfd98b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,7 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. 
*/ +#include <linux/dmi.h> #include <linux/pci.h> #include <linux/irq.h> @@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a56bf6051f4e..54984b142641 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -38,8 +38,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... 
*/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0bfe0c1628f6..d84afb0a322d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,6 +116,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -899,7 +900,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1170,7 +1171,7 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1215,6 +1216,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. */ @@ -1319,6 +1322,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e69a68..6e8fcb6f7e1e 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..e04442345fc0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, 
struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d798c0da451c..5c574dff4c1a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -254,84 +254,45 @@ finish: } /* - * Reschedule call back. + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode */ -static inline void __smp_reschedule_interrupt(void) -{ - inc_irq_stat(irq_resched_count); - scheduler_ipi(); -} - __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - __smp_reschedule_interrupt(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} - -__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) -{ - /* - * Need to call irq_enter() before calling the trace point. - * __smp_reschedule_interrupt() calls irq_enter/exit() too (in - * scheduler_ipi(). This is OK, since those functions are allowed - * to nest. - */ - ipi_entering_ack_irq(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} + inc_irq_stat(irq_resched_count); -static inline void __smp_call_function_interrupt(void) -{ - generic_smp_call_function_interrupt(); - inc_irq_stat(irq_call_count); + if (trace_resched_ipi_enabled()) { + /* + * scheduler_ipi() might call irq_enter() as well, but + * nested calls are fine. 
+ */ + irq_enter(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + irq_exit(); + return; + } + scheduler_ipi(); } __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_call_function_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry -smp_trace_call_function_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); - trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); -} - -static inline void __smp_call_function_single_interrupt(void) -{ - generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); -} - -__visible void __irq_entry -smp_call_function_single_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); - __smp_call_function_single_interrupt(); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); exiting_irq(); } -__visible void __irq_entry -smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 5f25cfbd952e..5ee663836c08 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699baea1b..a106b9719c58 
100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 15515132bf0d..c6636d1f60b9 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -4,57 +4,38 @@ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> * */ -#include <asm/hw_irq.h> -#include <asm/desc.h> +#include <linux/jump_label.h> #include <linux/atomic.h> -atomic_t trace_idt_ctr = ATOMIC_INIT(0); -struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +#include <asm/hw_irq.h> +#include <asm/desc.h> -static int trace_irq_vector_refcount; -static DEFINE_MUTEX(irq_vector_mutex); +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -static void set_trace_idt_ctr(int val) +int trace_pagefault_reg(void) { - atomic_set(&trace_idt_ctr, val); - /* Ensure the trace_idt_ctr is set before sending IPI */ - wmb(); + static_branch_inc(&trace_pagefault_key); + return 0; } -static void switch_idt(void *arg) +void trace_pagefault_unreg(void) { - unsigned long flags; - - local_irq_save(flags); - load_current_idt(); - local_irq_restore(flags); + static_branch_dec(&trace_pagefault_key); } -int trace_irq_vector_regfunc(void) +#ifdef CONFIG_SMP + +DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); + +int trace_resched_ipi_reg(void) { - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - trace_irq_vector_refcount++; - mutex_unlock(&irq_vector_mutex); + static_branch_inc(&trace_resched_ipi_key); return 0; } -void 
trace_irq_vector_unregfunc(void) +void trace_resched_ipi_unreg(void) { - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(0); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - mutex_unlock(&irq_vector_mutex); + static_branch_dec(&trace_resched_ipi_key); } + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309b85da..34ea3651362e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include <linux/smp.h> #include <linux/io.h> -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -70,20 +65,13 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> #include <asm/proto.h> #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for early debugging. */ -void __init early_trap_init(void) -{ - /* - * Don't use IST to set DEBUG_STACK as it doesn't work until TSS - * is ready in cpu_init() <-- trap_init(). Before trap_init(), - * CPU runs at ring 0 so it is impossible to hit an invalid - * stack. Using the original stack works well enough at this - * early stage. DEBUG_STACK will be equipped after cpu_init() in - * trap_init(). 
- * - * We don't need to set trace_idt_table like set_intr_gate(), - * since we don't have trace_debug and it will be reset to - * 'debug' in trap_init() by set_intr_gate_ist(). - */ - set_intr_gate_notrace(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif - load_idt(&idt_descr); -} - -void __init early_trap_pf_init(void) -{ -#ifdef CONFIG_X86_64 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif -} - void __init trap_init(void) { - int i; - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(X86_TRAP_DE, divide_error); - set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); - /* int4 can be called from all */ - set_system_intr_gate(X86_TRAP_OF, &overflow); - set_intr_gate(X86_TRAP_BR, bounds); - set_intr_gate(X86_TRAP_UD, invalid_op); - set_intr_gate(X86_TRAP_NM, device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); - set_intr_gate(X86_TRAP_TS, invalid_TSS); - set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate(X86_TRAP_SS, stack_segment); - set_intr_gate(X86_TRAP_GP, general_protection); - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); - set_intr_gate(X86_TRAP_MF, coprocessor_error); - set_intr_gate(X86_TRAP_AC, alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); -#endif - set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, 
entry_INT80_compat); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif - -#ifdef CONFIG_X86_32 - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif + idt_setup_traps(); /* * Set the IDT descriptor to a fixed read-only location, so that the @@ -1030,20 +940,9 @@ void __init trap_init(void) */ cpu_init(); - /* - * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, ITS works only after - * cpu_init() loads TSS. See comments in early_trap_init(). - */ - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..d145a0b1f529 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? 
&state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ @@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) 
\ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. 
+ */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the 
.orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. 
+ */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? 
state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)&regs->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ?
state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; 
+ } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. 
This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . 
= ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59ca2eea522c..19adbb418443 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled) + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 762cdf2595f9..e1e89ee4af75 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } -static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_pkru(vcpu); -} - static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d7d248a000dd..4b9a3ae6b725 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * index of the protection domain, so pte_pkey * 2 is * is the index of the first bit for the domain. 
*/ - pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) + diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 099ff08b4aff..8dbd8dbc83eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } -static u32 svm_get_pkru(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -5414,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .get_pkru = svm_get_pkru, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 416d5ed320b6..70b90c0810d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -636,8 +636,6 @@ struct vcpu_vmx { u64 current_tsc_ratio; - bool guest_pkru_valid; - u32 guest_pkru; u32 host_pkru; /* @@ -2383,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; -} - static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -8786,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) vector = exit_intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(*desc); + entry = gate_offset(desc); asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" @@ -9020,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - 
__write_pkru(vmx->guest_pkru); + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -9169,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * back on host, so it is safe to read guest PKRU from current * XSAVE. */ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; } /* @@ -11682,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eda4bdbd7e5e..ef5102f80497 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3246,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest + offset, src, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(dest + offset, &vcpu->arch.pkru, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest + offset, src, size); + } valid -= feature; @@ -3284,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest, src + offset, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(&vcpu->arch.pkru, src + offset, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest, src + offset, size); } valid -= feature; @@ -6726,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } 
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7634,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); trace_kvm_fpu(1); } diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. 
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). 
-:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. 
*/ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. 
:*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. 
- * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. 
This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. 
- */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. 
Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). 
- * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. 
cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. 
- * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. 
- * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). 
- */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... 
except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. 
*/ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. 
So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. 
- */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. 
Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. 
- */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. 
- * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. 
- */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. 
- */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. 
- */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." 
This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. 
- */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. - */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = 
lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. 
- */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. 
- * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! - * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. 
- */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. 
- */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. - */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. 
*/ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. 
- * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. - */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index f77ba3058b31..066996dba6a2 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -363,3 +363,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ +ENDPROC(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 47099628fa4c..2c71527bd917 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -44,4 +44,4 @@ ENTRY(FPU_div_small) leave ret - +ENDPROC(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203baefb5c0..d4a7df2205b8 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info) /* 
16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(&code_descriptor); + code_limit = seg_get_limit(&code_descriptor) + 1; + code_limit *= seg_get_granularity(&code_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0xffffffff; } diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a179254a5122..699f329f1d40 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) -#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ - | (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) -#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ - == (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit1 << 16) | d->limit0; +} + +static inline unsigned long 
seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (&current->thread.fpu.state) #define FPU_info (I387->soft.info) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b8ef9f9d2ffc..c48967c6a0e2 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment, } descriptor = FPU_get_ldt_descriptor(addr->selector); - base_address = SEG_BASE_ADDR(descriptor); + base_address = seg_get_base(&descriptor); address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + limit = seg_get_limit(&descriptor) + 1; + limit *= seg_get_granularity(&descriptor); + limit += base_address - 1; if (limit < base_address) limit = 0xffffffff; - if (SEG_EXPAND_DOWN(descriptor)) { - if (SEG_G_BIT(descriptor)) + if (seg_expands_down(&descriptor)) { + if (descriptor.g) { seg_top = 0xffffffff; - else { + } else { seg_top = base_address + (1 << 20); if (seg_top < base_address) seg_top = 0xffffffff; @@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment, (address > limit) || (address < base_address) ? 0 : ((limit - address) >= 254 ?
255 : limit - address + 1); } - if (SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + if (seg_execute_only(&descriptor) || + (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { access_limit = 0; } return address; diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 717785a53eb4..22e0631bb85a 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -62,6 +62,7 @@ ENTRY(mul32_Xsig) popl %esi leave ret +ENDPROC(mul32_Xsig) ENTRY(mul64_Xsig) @@ -114,6 +115,7 @@ ENTRY(mul64_Xsig) popl %esi leave ret +ENDPROC(mul64_Xsig) @@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret - +ENDPROC(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 17315c89ff3d..a9aaf414135d 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -133,3 +133,4 @@ L_accum_done: popl %esi leave ret +ENDPROC(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 8b6352efceef..53ac1a343c69 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -94,6 +94,7 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit +ENDPROC(FPU_normalize) @@ -145,3 +146,4 @@ L_exit_nuo_zero: popl %ebx leave ret +ENDPROC(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index d1d4e48b4f67..41af5b208d88 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -706,3 +706,5 @@ L_exception_exit: mov $-1,%eax jmp fpu_reg_round_special_exit #endif /* PARANOID */ + +ENDPROC(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 47c4c2434d85..3b1bc5e9b2f6 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -165,3 +165,4 @@ L_exit: leave ret #endif /* PARANOID */ +ENDPROC(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S 
b/arch/x86/math-emu/reg_u_div.S index cc00654b6f9a..796eb5ab921b 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -469,3 +469,5 @@ L_exit: leave ret #endif /* PARANOID */ + +ENDPROC(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 973f12af97df..6196f68cf3c1 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -146,3 +146,4 @@ L_exit: ret #endif /* PARANOID */ +ENDPROC(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index 1b6c24801d22..d115b900919a 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -270,3 +270,4 @@ L_exit: popl %esi leave ret +ENDPROC(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index bbe0e87718e4..87c99749a495 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -78,7 +78,7 @@ L_exit: popl %ebx leave ret - +ENDPROC(round_Xsig) @@ -138,4 +138,4 @@ L_n_exit: popl %ebx leave ret - +ENDPROC(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 31cdd118e918..c8552edeec75 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -85,3 +85,4 @@ L_more_than_95: popl %esi leave ret +ENDPROC(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 518428317985..340dd6897f85 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -92,6 +92,7 @@ L_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -202,3 +203,4 @@ Ls_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..695afae38fdf 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -468,3 +468,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl 
$0x7fffff00,%eax jmp sqrt_round_result +ENDPROC(wm_sqrt) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? 
"hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0cdf14cf3270..b836a7274e12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1258,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1494,27 +1490,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. 
- */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1525,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..d805162e6045 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. 
The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. 
@@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ce104b962a17..dbbcfd59726a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -214,6 +214,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* + * Call this when reinitializing a CPU. 
It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. + * + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_CR4_PCIDE) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ + write_cr3(cr3 & ~CR3_PCID_MASK); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} + +/* * flush_tlb_func_common()'s memory ordering requirement is that any * TLB fills that happen after we flush the TLB are ordered after we * read active_mm's tlb_gen. 
We don't need any explicit barriers diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e1324f280e06..8c9573660d51 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -94,7 +94,9 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 +#define X86_JL 0x7C #define X86_JGE 0x7D +#define X86_JLE 0x7E #define X86_JG 0x7F static void bpf_flush_icache(void *start, void *end) @@ -285,7 +287,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -294,21 +296,20 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -888,9 +889,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case 
BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, add_2reg(0xC0, dst_reg, src_reg)); @@ -911,9 +916,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* cmp dst_reg, imm8/32 */ EMIT1(add_1mod(0x48, dst_reg)); @@ -935,18 +944,34 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ /* GT is unsigned '>', JA in x86 */ jmp_cond = X86_JA; break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = X86_JB; + break; case BPF_JGE: /* GE is unsigned '>=', JAE in x86 */ jmp_cond = X86_JAE; break; + case BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = X86_JBE; + break; case BPF_JSGT: /* signed '>', GT in x86 */ jmp_cond = X86_JG; break; + case BPF_JSLT: + /* signed '<', LT in x86 */ + jmp_cond = X86_JL; + break; case BPF_JSGE: /* signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; + case BPF_JSLE: + /* signed '<=', LE in x86 */ + jmp_cond = X86_JLE; + break; default: /* to silence gcc warning */ return -EFAULT; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 78459a6d455a..4d68d59f457d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -181,6 +181,7 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ + initialize_tlbstate_and_flush(); fpu__resume_cpu(); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ae4cd58c0c7a..02250b2633b8 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(HOST_GS, 
GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else -#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET) +#ifdef FP_XSTATE_MAGIC1 DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long)); #else DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index df1921751aa5..ae2a2e2d6362 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t, @@ -586,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } +#ifdef CONFIG_X86_64 +struct trap_array_entry { + void (*orig)(void); + void (*xen)(void); + bool ist_okay; +}; + +static struct trap_array_entry trap_array[] = { + { debug, xen_xendebug, true }, + { int3, xen_xenint3, true }, + { double_fault, xen_double_fault, true }, +#ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, +#endif + { nmi, xen_nmi, true }, + { overflow, xen_overflow, false }, +#ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +#endif + { page_fault, xen_page_fault, false }, + { divide_error, xen_divide_error, false }, + { bounds, xen_bounds, false }, + { invalid_op, xen_invalid_op, false }, + { device_not_available, xen_device_not_available, false }, + { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + { invalid_TSS, xen_invalid_TSS, false }, + { segment_not_present, xen_segment_not_present, false }, + { stack_segment, xen_stack_segment, false }, + { general_protection, xen_general_protection, false }, + { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + { coprocessor_error, 
xen_coprocessor_error, false }, + { alignment_check, xen_alignment_check, false }, + { simd_coprocessor_error, xen_simd_coprocessor_error, false }, +}; + +static bool get_trap_addr(void **addr, unsigned int ist) +{ + unsigned int nr; + bool ist_okay = false; + + /* + * Replace trap handler addresses by Xen specific ones. + * Check for known traps using IST and whitelist them. + * The debugger ones are the only ones we care about. + * Xen will handle faults like double_fault, * so we should never see + * them. Warn if there's an unexpected IST-using fault handler. + */ + for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { + struct trap_array_entry *entry = trap_array + nr; + + if (*addr == entry->orig) { + *addr = entry->xen; + ist_okay = entry->ist_okay; + break; + } + } + + if (WARN_ON(ist != 0 && !ist_okay)) + return false; + + return true; +} +#endif + static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { unsigned long addr; - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) return 0; info->vector = vector; - addr = gate_offset(*val); + addr = gate_offset(val); #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. 
- */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ + if (!get_trap_addr((void **)&addr, val->bits.ist)) return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } #endif /* CONFIG_X86_64 */ info->address = addr; - info->cs = gate_segment(*val); - info->flags = val->dpl; + info->cs = gate_segment(val); + info->flags = val->bits.dpl; /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) + if (val->bits.type == GATE_INTERRUPT) info->flags |= 1 << 2; return 1; @@ -988,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with 
start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1050,10 +1029,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1251,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 33e92955e09d..d4eff5676cfa 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { .safe_halt = xen_safe_halt, .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif }; void __init xen_init_irq_ops(void) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..dcd31fa39b5d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. 
* * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/asm-offsets.h> @@ -16,7 +10,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. This clears the event mask and tests the pending @@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. 
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..1200e262a116 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. 
*/ #include <asm/thread_info.h> @@ -18,21 +12,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..dae2cc33afb5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. 
*/ #include <asm/errno.h> @@ -20,13 +14,44 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> + +.macro xen_pv_trap name +ENTRY(xen_\name) + pop %rcx + pop %r11 + jmp \name +END(xen_\name) +.endm -ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx - mov 8+8(%rsp), %r11 - ret $16 -ENDPROC(xen_adjust_exception_frame) +xen_pv_trap divide_error +xen_pv_trap debug +xen_pv_trap xendebug +xen_pv_trap int3 +xen_pv_trap xenint3 +xen_pv_trap nmi +xen_pv_trap overflow +xen_pv_trap bounds +xen_pv_trap invalid_op +xen_pv_trap device_not_available +xen_pv_trap double_fault +xen_pv_trap coprocessor_segment_overrun +xen_pv_trap invalid_TSS +xen_pv_trap segment_not_present +xen_pv_trap stack_segment +xen_pv_trap general_protection +xen_pv_trap page_fault +xen_pv_trap spurious_interrupt_bug +xen_pv_trap coprocessor_error +xen_pv_trap alignment_check +#ifdef CONFIG_X86_MCE +xen_pv_trap machine_check +#endif /* CONFIG_X86_MCE */ +xen_pv_trap simd_coprocessor_error +#ifdef CONFIG_IA32_EMULATION +xen_pv_trap entry_INT80_compat +#endif +xen_pv_trap hypervisor_callback hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* @@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +88,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. 
*/ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d5004477db6..c8a6d224f7ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) 
\ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); __visible void xen_sysret64(void); -__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); |