aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild3
-rw-r--r--arch/x86/Kconfig24
-rw-r--r--arch/x86/Kconfig.debug59
-rw-r--r--arch/x86/Makefile17
-rw-r--r--arch/x86/boot/compressed/eboot.c10
-rw-r--r--arch/x86/boot/compressed/head_32.S129
-rw-r--r--arch/x86/boot/compressed/head_64.S112
-rw-r--r--arch/x86/boot/compressed/kaslr.c90
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/header.S8
-rw-r--r--arch/x86/configs/tiny.config2
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c4
-rw-r--r--arch/x86/crypto/blowfish_glue.c3
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c3
-rw-r--r--arch/x86/crypto/des3_ede_glue.c3
-rw-r--r--arch/x86/entry/Makefile1
-rw-r--r--arch/x86/entry/calling.h5
-rw-r--r--arch/x86/entry/common.c3
-rw-r--r--arch/x86/entry/entry_32.S20
-rw-r--r--arch/x86/entry/entry_64.S237
-rw-r--r--arch/x86/entry/entry_64_compat.S12
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/events/amd/uncore.c21
-rw-r--r--arch/x86/events/core.c64
-rw-r--r--arch/x86/events/intel/Makefile2
-rw-r--r--arch/x86/events/intel/bts.c2
-rw-r--r--arch/x86/events/intel/core.c107
-rw-r--r--arch/x86/events/intel/cqm.c1766
-rw-r--r--arch/x86/events/intel/ds.c58
-rw-r--r--arch/x86/events/intel/lbr.c52
-rw-r--r--arch/x86/events/intel/pt.c5
-rw-r--r--arch/x86/events/perf_event.h10
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/asm.h6
-rw-r--r--arch/x86/include/asm/atomic.h69
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h73
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/desc.h248
-rw-r--r--arch/x86/include/asm/desc_defs.h122
-rw-r--r--arch/x86/include/asm/elf.h15
-rw-r--r--arch/x86/include/asm/entry_arch.h17
-rw-r--r--arch/x86/include/asm/fpu/internal.h6
-rw-r--r--arch/x86/include/asm/futex.h40
-rw-r--r--arch/x86/include/asm/hw_irq.h20
-rw-r--r--arch/x86/include/asm/intel_rdt.h286
-rw-r--r--arch/x86/include/asm/intel_rdt_common.h27
-rw-r--r--arch/x86/include/asm/intel_rdt_sched.h92
-rw-r--r--arch/x86/include/asm/io.h98
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/irq_work.h8
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/lguest.h91
-rw-r--r--arch/x86/include/asm/lguest_hcall.h74
-rw-r--r--arch/x86/include/asm/mmu_context.h4
-rw-r--r--arch/x86/include/asm/module.h9
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/orc_lookup.h46
-rw-r--r--arch/x86/include/asm/orc_types.h107
-rw-r--r--arch/x86/include/asm/paravirt.h5
-rw-r--r--arch/x86/include/asm/paravirt_types.h3
-rw-r--r--arch/x86/include/asm/processor.h5
-rw-r--r--arch/x86/include/asm/proto.h3
-rw-r--r--arch/x86/include/asm/ptrace.h43
-rw-r--r--arch/x86/include/asm/refcount.h109
-rw-r--r--arch/x86/include/asm/rmwcc.h37
-rw-r--r--arch/x86/include/asm/segment.h4
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/thread_info.h5
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/topology.h6
-rw-r--r--arch/x86/include/asm/trace/common.h16
-rw-r--r--arch/x86/include/asm/trace/exceptions.h8
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h51
-rw-r--r--arch/x86/include/asm/traps.h50
-rw-r--r--arch/x86/include/asm/uaccess.h7
-rw-r--r--arch/x86/include/asm/unwind.h76
-rw-r--r--arch/x86/include/asm/unwind_hints.h105
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h2
-rw-r--r--arch/x86/include/uapi/asm/mman.h3
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/acpi/boot.c11
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/apic/apic.c76
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c20
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c26
-rw-r--r--arch/x86/kernel/cpu/common.c24
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c32
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c375
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h440
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c (renamed from arch/x86/kernel/cpu/intel_rdt_schemata.c)67
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c499
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c1117
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c16
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c27
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c21
-rw-r--r--arch/x86/kernel/dumpstack.c14
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c4
-rw-r--r--arch/x86/kernel/early-quirks.c5
-rw-r--r--arch/x86/kernel/eisa.c19
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/head64.c6
-rw-r--r--arch/x86/kernel/head_32.S66
-rw-r--r--arch/x86/kernel/idt.c371
-rw-r--r--arch/x86/kernel/irq.c40
-rw-r--r--arch/x86/kernel/irq_work.c20
-rw-r--r--arch/x86/kernel/irqinit.c102
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kernel/ldt.c21
-rw-r--r--arch/x86/kernel/machine_kexec_32.c14
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/nmi.c18
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/platform-quirks.c1
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c244
-rw-r--r--arch/x86/kernel/quirks.c10
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/kernel/setup_percpu.c9
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smp.c81
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/tls.c2
-rw-r--r--arch/x86/kernel/tracepoint.c57
-rw-r--r--arch/x86/kernel/traps.c107
-rw-r--r--arch/x86/kernel/unwind_frame.c41
-rw-r--r--arch/x86/kernel/unwind_guess.c5
-rw-r--r--arch/x86/kernel/unwind_orc.c582
-rw-r--r--arch/x86/kernel/vmlinux.lds.S3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/svm.c7
-rw-r--r--arch/x86/kvm/vmx.c27
-rw-r--r--arch/x86/kvm/x86.c28
-rw-r--r--arch/x86/lguest/Kconfig14
-rw-r--r--arch/x86/lguest/Makefile2
-rw-r--r--arch/x86/lguest/boot.c1558
-rw-r--r--arch/x86/lguest/head_32.S192
-rw-r--r--arch/x86/math-emu/div_Xsig.S1
-rw-r--r--arch/x86/math-emu/div_small.S2
-rw-r--r--arch/x86/math-emu/fpu_entry.c11
-rw-r--r--arch/x86/math-emu/fpu_system.h48
-rw-r--r--arch/x86/math-emu/get_address.c17
-rw-r--r--arch/x86/math-emu/mul_Xsig.S4
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S1
-rw-r--r--arch/x86/math-emu/reg_norm.S2
-rw-r--r--arch/x86/math-emu/reg_round.S2
-rw-r--r--arch/x86/math-emu/reg_u_add.S1
-rw-r--r--arch/x86/math-emu/reg_u_div.S2
-rw-r--r--arch/x86/math-emu/reg_u_mul.S1
-rw-r--r--arch/x86/math-emu/reg_u_sub.S1
-rw-r--r--arch/x86/math-emu/round_Xsig.S4
-rw-r--r--arch/x86/math-emu/shr_Xsig.S1
-rw-r--r--arch/x86/math-emu/wm_shrx.S2
-rw-r--r--arch/x86/math-emu/wm_sqrt.S1
-rw-r--r--arch/x86/mm/extable.c44
-rw-r--r--arch/x86/mm/fault.c49
-rw-r--r--arch/x86/mm/numa_emulation.c55
-rw-r--r--arch/x86/mm/tlb.c44
-rw-r--r--arch/x86/net/bpf_jit_comp.c35
-rw-r--r--arch/x86/power/cpu.c1
-rw-r--r--arch/x86/um/user-offsets.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c169
-rw-r--r--arch/x86/xen/irq.c3
-rw-r--r--arch/x86/xen/xen-asm.S26
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S27
-rw-r--r--arch/x86/xen/xen-asm_64.S102
-rw-r--r--arch/x86/xen/xen-ops.h16
183 files changed, 5850 insertions, 6351 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 3e6f64073005..0038a2d10a7a 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
# Hyper-V paravirtualization support
obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
obj-y += realmode/
obj-y += kernel/
obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 87e447286c37..4b278a33ccbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,8 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
+ # Causing hangs/crashes, see the commit that added this change for details.
+ select ARCH_HAS_REFCOUNT if BROKEN
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
@@ -73,7 +75,6 @@ config X86
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
@@ -158,6 +159,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
@@ -169,7 +171,7 @@ config X86
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -427,16 +429,16 @@ config GOLDFISH
def_bool y
depends on X86_GOLDFISH
-config INTEL_RDT_A
- bool "Intel Resource Director Technology Allocation support"
+config INTEL_RDT
+ bool "Intel Resource Director Technology support"
default n
depends on X86 && CPU_SUP_INTEL
select KERNFS
help
- Select to enable resource allocation which is a sub-feature of
- Intel Resource Director Technology(RDT). More information about
- RDT can be found in the Intel x86 Architecture Software
- Developer Manual.
+ Select to enable resource allocation and monitoring which are
+ sub-features of Intel Resource Director Technology(RDT). More
+ information about RDT can be found in the Intel x86
+ Architecture Software Developer Manual.
Say N if unsure.
@@ -780,8 +782,6 @@ config KVM_DEBUG_FS
Statistics are displayed in debugfs filesystem. Enabling this option
may incur significant overhead.
-source "arch/x86/lguest/Kconfig"
-
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
@@ -1806,7 +1806,9 @@ config X86_SMAP
config X86_INTEL_MPX
prompt "Intel MPX (Memory Protection Extensions)"
def_bool n
- depends on CPU_SUP_INTEL
+ # Note: only available in 64-bit mode due to VMA flags shortage
+ depends on CPU_SUP_INTEL && X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
---help---
MPX provides hardware features that can be used in
conjunction with compiler-instrumented code to check
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cd20ca0b4043..71a48a30fc84 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
Some of these sanity checks may slow down kernel entries and
exits or otherwise impact performance.
- This is currently used to help test NMI code.
-
If unsure, say N.
config DEBUG_NMI_SELFTEST
@@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG
The current power state can be read from
/sys/kernel/debug/punit_atom/dev_power_state
+choice
+ prompt "Choose kernel unwinder"
+ default FRAME_POINTER_UNWINDER
+ ---help---
+ This determines which method will be used for unwinding kernel stack
+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+ livepatch, lockdep, and more.
+
+config FRAME_POINTER_UNWINDER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config ORC_UNWINDER
+ bool "ORC unwinder"
+ depends on X86_64
+ select STACK_VALIDATION
+ ---help---
+ This option enables the ORC (Oops Rewind Capability) unwinder for
+ unwinding kernel stack traces. It uses a custom data format which is
+ a simplified version of the DWARF Call Frame Information standard.
+
+ This unwinder is more accurate across interrupt entry frames than the
+ frame pointer unwinder. It also enables a 5-10% performance
+ improvement across the entire kernel compared to frame pointers.
+
+ Enabling this option will increase the kernel's runtime memory usage
+ by roughly 2-4MB, depending on your kernel config.
+
+config GUESS_UNWINDER
+ bool "Guess unwinder"
+ depends on EXPERT
+ ---help---
+ This option enables the "guess" unwinder for unwinding kernel stack
+ traces. It scans the stack and reports every kernel text address it
+ finds. Some of the addresses it reports may be incorrect.
+
+ While this option often produces false positives, it can still be
+ useful in many cases. Unlike the other unwinders, it has no runtime
+ overhead.
+
+endchoice
+
+config FRAME_POINTER
+ depends on !ORC_UNWINDER && !GUESS_UNWINDER
+ bool
+
endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1e902f926be3..6276572259c8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
- cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
- cc_stack_align_opt := -mstack-alignment
+ cc_stack_align4 := -mpreferred-stack-boundary=2
+ cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+ cc_stack_align4 := -mstack-alignment=4
+ cc_stack_align8 := -mstack-alignment=8
endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
# Align the stack to the register width instead of using the default
# alignment of 16 bytes. This reduces stack usage and the number of
# alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
# a lot more stack due to the lack of sharing of stacklots:
@@ -115,7 +117,7 @@ else
# default alignment which keep the stack *mis*aligned.
# Furthermore an alignment to the register width reduces stack usage
# and the number of alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
@@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
-
archscripts: scripts_basic
$(Q)$(MAKE) $(build)=arch/x86/tools relocs
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c3e869eaef0c..926c2cc4facc 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params,
m |= (u64)efi->efi_memmap_hi << 32;
#endif
- d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
+ d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
switch (d->type) {
case EFI_RESERVED_TYPE:
case EFI_RUNTIME_SERVICES_CODE:
@@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
- desc->limit = 0xf;
+ desc->limit1 = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
@@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
- desc->limit = 0xf;
+ desc->limit1 = 0xf;
desc->avl = 0;
if (IS_ENABLED(CONFIG_X86_64)) {
desc->l = 1;
@@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
- desc->limit = 0xf;
+ desc->limit1 = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
@@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
- desc->limit = 0x0;
+ desc->limit1 = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d85b9625e836..11c68cf53d4e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -61,71 +61,6 @@
__HEAD
ENTRY(startup_32)
-#ifdef CONFIG_EFI_STUB
- jmp preferred_addr
-
- /*
- * We don't need the return address, so set up the stack so
- * efi_main() can find its arguments.
- */
-ENTRY(efi_pe_entry)
- add $0x4, %esp
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- popl %ecx
- movl %ecx, efi32_config(%esi) /* Handle */
- popl %ecx
- movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-
- call make_boot_params
- cmpl $0, %eax
- je fail
- movl %esi, BP_code32_start(%eax)
- popl %ecx
- pushl %eax
- pushl %ecx
- jmp 2f /* Skip efi_config initialization */
-
-ENTRY(efi32_stub_entry)
- add $0x4, %esp
- popl %ecx
- popl %edx
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- movl %ecx, efi32_config(%esi) /* Handle */
- movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-2:
- call efi_main
- cmpl $0, %eax
- movl %eax, %esi
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
- movl BP_code32_start(%esi), %eax
- leal preferred_addr(%eax), %eax
- jmp *%eax
-
-preferred_addr:
-#endif
cld
/*
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -208,6 +143,70 @@ preferred_addr:
jmp *%eax
ENDPROC(startup_32)
+#ifdef CONFIG_EFI_STUB
+/*
+ * We don't need the return address, so set up the stack so efi_main() can find
+ * its arguments.
+ */
+ENTRY(efi_pe_entry)
+ add $0x4, %esp
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ popl %ecx
+ movl %ecx, efi32_config(%esi) /* Handle */
+ popl %ecx
+ movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+
+ call make_boot_params
+ cmpl $0, %eax
+ je fail
+ movl %esi, BP_code32_start(%eax)
+ popl %ecx
+ pushl %eax
+ pushl %ecx
+ jmp 2f /* Skip efi_config initialization */
+ENDPROC(efi_pe_entry)
+
+ENTRY(efi32_stub_entry)
+ add $0x4, %esp
+ popl %ecx
+ popl %edx
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ movl %ecx, efi32_config(%esi) /* Handle */
+ movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+2:
+ call efi_main
+ cmpl $0, %eax
+ movl %eax, %esi
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leal startup_32(%eax), %eax
+ jmp *%eax
+ENDPROC(efi32_stub_entry)
+#endif
+
.text
relocated:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fbf4c32d0b62..b4a5d284391c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -243,65 +243,6 @@ ENTRY(startup_64)
* that maps our entire kernel(text+data+bss+brk), zero page
* and command line.
*/
-#ifdef CONFIG_EFI_STUB
- /*
- * The entry point for the PE/COFF executable is efi_pe_entry, so
- * only legacy boot loaders will execute this jmp.
- */
- jmp preferred_addr
-
-ENTRY(efi_pe_entry)
- movq %rcx, efi64_config(%rip) /* Handle */
- movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
-
- leaq efi64_config(%rip), %rax
- movq %rax, efi_config(%rip)
-
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- addq %rbp, efi64_config+40(%rip)
-
- movq %rax, %rdi
- call make_boot_params
- cmpq $0,%rax
- je fail
- mov %rax, %rsi
- leaq startup_32(%rip), %rax
- movl %eax, BP_code32_start(%rsi)
- jmp 2f /* Skip the relocation */
-
-handover_entry:
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- movq efi_config(%rip), %rax
- addq %rbp, 40(%rax)
-2:
- movq efi_config(%rip), %rdi
- call efi_main
- movq %rax,%rsi
- cmpq $0,%rax
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
- movl BP_code32_start(%esi), %eax
- leaq preferred_addr(%rax), %rax
- jmp *%rax
-
-preferred_addr:
-#endif
/* Setup data segments. */
xorl %eax, %eax
@@ -413,6 +354,59 @@ lvl5:
jmp *%rax
#ifdef CONFIG_EFI_STUB
+
+/* The entry point for the PE/COFF executable is efi_pe_entry. */
+ENTRY(efi_pe_entry)
+ movq %rcx, efi64_config(%rip) /* Handle */
+ movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ addq %rbp, efi64_config+40(%rip)
+
+ movq %rax, %rdi
+ call make_boot_params
+ cmpq $0,%rax
+ je fail
+ mov %rax, %rsi
+ leaq startup_32(%rip), %rax
+ movl %eax, BP_code32_start(%rsi)
+ jmp 2f /* Skip the relocation */
+
+handover_entry:
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ movq efi_config(%rip), %rax
+ addq %rbp, 40(%rax)
+2:
+ movq efi_config(%rip), %rdi
+ call efi_main
+ movq %rax,%rsi
+ cmpq $0,%rax
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leaq startup_64(%rax), %rax
+ jmp *%rax
+ENDPROC(efi_pe_entry)
+
.org 0x390
ENTRY(efi64_stub_entry)
movq %rdi, efi64_config(%rip) /* Handle */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 99c7194f7ea6..17818ba6906f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -37,7 +37,9 @@
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
+#include <linux/efi.h>
#include <generated/utsrelease.h>
+#include <asm/efi.h>
/* Macros used by the included decompressor code below. */
#define STATIC
@@ -558,6 +560,87 @@ static void process_mem_region(struct mem_vector *entry,
}
}
+#ifdef CONFIG_EFI
+/*
+ * Returns true if mirror region found (and must have been processed
+ * for slots adding)
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct efi_info *e = &boot_params->efi_info;
+ bool efi_mirror_found = false;
+ struct mem_vector region;
+ efi_memory_desc_t *md;
+ unsigned long pmap;
+ char *signature;
+ u32 nr_desc;
+ int i;
+
+ signature = (char *)&e->efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+ return false;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+ return false;
+ }
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
+
+ nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ efi_mirror_found = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+ /*
+ * Here we are more conservative in picking free memory than
+ * the EFI spec allows:
+ *
+ * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
+ * free memory and thus available to place the kernel image into,
+ * but in practice there's firmware where using that memory leads
+ * to crashes.
+ *
+ * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
+ */
+ if (md->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ if (efi_mirror_found &&
+ !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+ continue;
+
+ region.start = md->phys_addr;
+ region.size = md->num_pages << EFI_PAGE_SHIFT;
+ process_mem_region(&region, minimum, image_size);
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+ break;
+ }
+ }
+ return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ return false;
+}
+#endif
+
static void process_e820_entries(unsigned long minimum,
unsigned long image_size)
{
@@ -586,13 +669,16 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
{
/* Check if we had too many memmaps. */
if (memmap_too_large) {
- debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
+ debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
return 0;
}
/* Make sure minimum is aligned. */
minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+ if (process_efi_entries(minimum, image_size))
+ return slots_fetch_random();
+
process_e820_entries(minimum, image_size);
return slots_fetch_random();
}
@@ -652,7 +738,7 @@ void choose_random_location(unsigned long input,
*/
min_addr = min(*output, 512UL << 20);
- /* Walk e820 and find a random address. */
+ /* Walk available memory entries to find a random address. */
random_addr = find_random_phys_addr(min_addr, output_size);
if (!random_addr) {
warn("Physical KASLR disabled: no suitable memory region!");
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a0838ab929f2..c14217cd0155 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,8 +116,7 @@ void __putstr(const char *s)
}
}
- if (boot_params->screen_info.orig_video_mode == 0 &&
- lines == 0 && cols == 0)
+ if (lines == 0 || cols == 0)
return;
x = boot_params->screen_info.orig_x;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2ed8f0c25def..1bb08ecffd24 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
# the description in lib/decompressor_xxx.c for specific information.
#
# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
-#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128)
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536)
#if ZO_z_output_len > ZO_z_input_len
# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
ZO_z_input_len)
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4b429df40d7a..550cd5012b73 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,3 +1,5 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
+CONFIG_GUESS_UNWINDER=y
+# CONFIG_FRAME_POINTER_UNWINDER is not set
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 4a55cdcdc008..5c15d6b57329 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
unsigned int nbytes = walk->nbytes;
aesni_enc(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 17c05531dfd1..f9eca34301e2 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
unsigned int nbytes = walk->nbytes;
blowfish_enc_blk(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
crypto_inc(ctrblk, BF_BLOCK_SIZE);
}
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 8648158f3916..dbea6020ffe7 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;
__cast5_encrypt(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
}
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index d6fc59aaaadf..30c0a37f4882 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
unsigned int nbytes = walk->nbytes;
des3_ede_enc_blk(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
}
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
/*
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %rdx, 12*8+\offset(%rsp)
movq %rsi, 13*8+\offset(%rsp)
movq %rdi, 14*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro SAVE_C_REGS offset=0
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %r12, 3*8+\offset(%rsp)
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset
.endm
.macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
movq 3*8+\offset(%rsp), %r12
movq 4*8+\offset(%rsp), %rbp
movq 5*8+\offset(%rsp), %rbx
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
movq 13*8(%rsp), %rsi
movq 14*8(%rsp), %rdi
+ UNWIND_HINT_IRET_REGS offset=16*8
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ addr_limit_user_check();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 48ef7bb32c42..8a13d468635a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -673,16 +673,8 @@ ENTRY(name) \
jmp ret_from_intr; \
ENDPROC(name)
-
-#ifdef CONFIG_TRACING
-# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-# define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-
#define BUILD_INTERRUPT(name, nr) \
BUILD_INTERRUPT3(name, nr, smp_##name); \
- TRACE_BUILD_INTERRUPT(name, nr)
/* The include is where all of the SMP etc. interrupts come from */
#include <asm/entry_arch.h>
@@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback)
ENDPROC(xen_failsafe_callback)
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
+ xen_evtchn_do_upcall)
#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
+ hyperv_vector_handler)
#endif /* CONFIG_HYPERV */
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
- ASM_CLAC
- pushl $trace_do_page_fault
- jmp common_exception
-END(trace_page_fault)
-#endif
-
ENTRY(page_fault)
ASM_CLAC
pushl $do_page_fault
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6d078b89a5e8..49167258d587 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
+#include <asm/frame.h>
#include <linux/err.h>
.code64
@@ -43,9 +44,10 @@
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
+ UNWIND_HINT_EMPTY
swapgs
sysretq
-ENDPROC(native_usergs_sysret64)
+END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
.macro TRACE_IRQS_IRETQ
@@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64)
*/
ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
+ swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+ UNWIND_HINT_REGS extra=0
/*
* If we need to do entry work or if we guess we'll need to do
@@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath:
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
1:
@@ -316,6 +316,7 @@ syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
opportunistic_sysret_failed:
@@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
popq %rax
+ UNWIND_HINT_REGS extra=0
jmp entry_SYSCALL64_slow_path
1:
@@ -351,6 +353,7 @@ END(stub_ptregs_64)
.macro ptregs_stub func
ENTRY(ptregs_\func)
+ UNWIND_HINT_FUNC
leaq \func(%rip), %rax
jmp stub_ptregs_64
END(ptregs_\func)
@@ -367,6 +370,7 @@ END(ptregs_\func)
* %rsi: next task
*/
ENTRY(__switch_to_asm)
+ UNWIND_HINT_FUNC
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -406,6 +410,7 @@ END(__switch_to_asm)
* r12: kernel thread arg
*/
ENTRY(ret_from_fork)
+ UNWIND_HINT_EMPTY
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -413,6 +418,7 @@ ENTRY(ret_from_fork)
jnz 1f /* kernel threads are uncommon */
2:
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
@@ -440,13 +446,102 @@ END(ret_from_fork)
ENTRY(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
pushq $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
jmp common_interrupt
.align 8
+ vector=vector+1
.endr
END(irq_entries_start)
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushfq
+ testl $X86_EFLAGS_IF, (%rsp)
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK regs=1 old_rsp
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ movq %rsp, \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS base=\old_rsp
+ .endif
+
+ incl PER_CPU_VAR(irq_count)
+ jnz .Lirq_stack_push_old_rsp_\@
+
+ /*
+ * Right now, if we just incremented irq_count to zero, we've
+ * claimed the IRQ stack but we haven't switched to it yet.
+ *
+ * If anything is added that can interrupt us here without using IST,
+ * it must be *extremely* careful to limit its stack usage. This
+ * could include kprobes and a hypothetical future IST-less #DB
+ * handler.
+ *
+ * The OOPS unwinder relies on the word at the top of the IRQ
+ * stack linking back to the previous RSP for the entire time we're
+ * on the IRQ stack. For this to work reliably, we need to write
+ * it before we actually move ourselves to the IRQ stack.
+ */
+
+ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+ movq PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * If the first movq above becomes wrong due to IRQ stack layout
+ * changes, the only way we'll notice is if we try to unwind right
+ * here. Assert that we set up the stack right to catch this type
+ * of bug quickly.
+ */
+ cmpq -8(%rsp), \old_rsp
+ je .Lirq_stack_okay\@
+ ud2
+ .Lirq_stack_okay\@:
+#endif
+
+.Lirq_stack_push_old_rsp_\@:
+ pushq \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS indirect=1
+ .endif
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK regs=1
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ /* We need to be off the IRQ stack before decrementing irq_count. */
+ popq %rsp
+
+ .if \regs
+ UNWIND_HINT_REGS
+ .endif
+
+ /*
+ * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+ * the irq stack but we're not on it.
+ */
+
+ decl PER_CPU_VAR(irq_count)
+.endm
+
/*
* Interrupt entry/exit.
*
@@ -485,17 +580,7 @@ END(irq_entries_start)
CALL_enter_from_user_mode
1:
- /*
- * Save previous stack pointer, optionally switch to interrupt stack.
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
- movq %rsp, %rdi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rdi
+ ENTER_IRQ_STACK old_rsp=%rdi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -515,10 +600,8 @@ common_interrupt:
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
- /* Restore saved previous stack */
- popq %rsp
+ LEAVE_IRQ_STACK
testb $3, CS(%rsp)
jz retint_kernel
@@ -561,6 +644,7 @@ restore_c_regs_and_iret:
INTERRUPT_RETURN
ENTRY(native_iret)
+ UNWIND_HINT_IRET_REGS
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
@@ -633,6 +717,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
/*
* At this point, we cannot write to the stack any more, but we can
@@ -654,6 +739,7 @@ END(common_interrupt)
*/
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS
ASM_CLAC
pushq $~(\num)
.Lcommon_\sym:
@@ -662,31 +748,13 @@ ENTRY(\sym)
END(\sym)
.endm
-#ifdef CONFIG_TRACING
-#define trace(sym) trace_##sym
-#define smp_trace(sym) smp_trace_##sym
-
-.macro trace_apicinterrupt num sym
-apicinterrupt3 \num trace(\sym) smp_trace(\sym)
-.endm
-#else
-.macro trace_apicinterrupt num sym do_sym
-.endm
-#endif
-
/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY .popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
apicinterrupt3 \num \sym \do_sym
-trace_apicinterrupt \num \sym
POP_SECTION_IRQENTRY
.endm
@@ -740,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS offset=8
+
/* Sanity check */
.if \shift_ist != -1 && \paranoid == 0
.error "using shift_ist requires paranoid=1"
.endif
ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
.ifeq \has_error_code
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -763,6 +832,7 @@ ENTRY(\sym)
.else
call error_entry
.endif
+ UNWIND_HINT_REGS
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
.if \paranoid
@@ -829,17 +899,6 @@ ENTRY(\sym)
END(\sym)
.endm
-#ifdef CONFIG_TRACING
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#else
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#endif
-
idtentry divide_error do_divide_error has_error_code=0
idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
@@ -860,6 +919,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
* edi: new selector
*/
ENTRY(native_load_gs_index)
+ FRAME_BEGIN
pushfq
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
@@ -868,8 +928,9 @@ ENTRY(native_load_gs_index)
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
SWAPGS
popfq
+ FRAME_END
ret
-END(native_load_gs_index)
+ENDPROC(native_load_gs_index)
EXPORT_SYMBOL(native_load_gs_index)
_ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -892,17 +953,15 @@ bad_gs:
ENTRY(do_softirq_own_stack)
pushq %rbp
mov %rsp, %rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr), %rsp
- push %rbp /* frame pointer backlink */
+ ENTER_IRQ_STACK regs=0 old_rsp=%r11
call __do_softirq
+ LEAVE_IRQ_STACK regs=0
leaveq
- decl PER_CPU_VAR(irq_count)
ret
-END(do_softirq_own_stack)
+ENDPROC(do_softirq_own_stack)
#ifdef CONFIG_XEN
-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -923,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
* see the correct pointer to the pt_regs
*/
+ UNWIND_HINT_FUNC
movq %rdi, %rsp /* we don't return, adjust the stack frame */
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp, %rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rbp /* frame pointer backlink */
+ UNWIND_HINT_REGS
+
+ ENTER_IRQ_STACK old_rsp=%r10
call xen_evtchn_do_upcall
- popq %rsp
- decl PER_CPU_VAR(irq_count)
+ LEAVE_IRQ_STACK
+
#ifndef CONFIG_PREEMPT
call xen_maybe_preempt_hcall
#endif
@@ -951,6 +1010,7 @@ END(xen_do_hypervisor_callback)
* with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
+ UNWIND_HINT_EMPTY
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
jne 1f
@@ -968,13 +1028,13 @@ ENTRY(xen_failsafe_callback)
movq 8(%rsp), %r11
addq $0x30, %rsp
pushq $0 /* RIP */
- pushq %r11
- pushq %rcx
+ UNWIND_HINT_IRET_REGS offset=8
jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
@@ -998,13 +1058,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
+idtentry xendebug do_debug has_error_code=0
+idtentry xenint3 do_int3 has_error_code=0
#endif
idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault do_async_page_fault has_error_code=1
@@ -1020,6 +1079,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1047,6 +1107,7 @@ END(paranoid_entry)
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
@@ -1068,6 +1129,7 @@ END(paranoid_exit)
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1152,6 +1214,7 @@ END(error_entry)
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
ENTRY(error_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
testl %ebx, %ebx
@@ -1160,19 +1223,9 @@ ENTRY(error_exit)
END(error_exit)
/* Runs on exception stack */
+/* XXX: broken on Xen PV */
ENTRY(nmi)
- /*
- * Fix up the exception frame if we're on Xen.
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
- * one value to the stack on native, so it may clobber the rdx
- * scratch slot, but it won't clobber any of the important
- * slots past it.
- *
- * Xen is a different story, because the Xen frame itself overlaps
- * the "NMI executing" variable.
- */
- PARAVIRT_ADJUST_EXCEPTION_FRAME
-
+ UNWIND_HINT_IRET_REGS
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
@@ -1234,11 +1287,13 @@ ENTRY(nmi)
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
pushq 3*8(%rdx) /* pt_regs->flags */
pushq 2*8(%rdx) /* pt_regs->cs */
pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -1255,6 +1310,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
ENCODE_FRAME_POINTER
/*
@@ -1409,6 +1465,7 @@ first_nmi:
.rept 5
pushq 11*8(%rsp)
.endr
+ UNWIND_HINT_IRET_REGS
/* Everything up to here is safe from nested NMIs */
@@ -1424,6 +1481,7 @@ first_nmi:
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
INTERRUPT_RETURN /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
1:
#endif
@@ -1473,6 +1531,7 @@ end_repeat_nmi:
* exceptions might do.
*/
call paranoid_entry
+ UNWIND_HINT_REGS
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
@@ -1510,17 +1569,19 @@ nmi_restore:
END(nmi)
ENTRY(ignore_sysret)
+ UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
ENTRY(rewind_stack_do_exit)
+ UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
call do_exit
-1: jmp 1b
END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..e26c25ca7756 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
*/
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ swapgs
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
*/
- PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
@@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat)
jmp restore_regs_and_iret
END(entry_INT80_compat)
- ALIGN
-GLOBAL(stub32_clone)
+ENTRY(stub32_clone)
/*
* The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
* The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -353,3 +350,4 @@ GLOBAL(stub32_clone)
*/
xchg %r8, %rcx
jmp sys_clone
+ENDPROC(stub32_clone)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 726355ce8497..1911310959f8 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg)
* and 8 bits for the node)
*/
d.limit0 = cpu | ((node & 0xf) << 12);
- d.limit = node >> 4;
+ d.limit1 = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3; /* Visible to user code */
d.s = 1; /* Not a system segment */
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index ad44af0dd667..f5cbbba99283 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
if (amd_uncore_llc) {
unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared;
+ unsigned int nshared, subleaf, prev_eax = 0;
uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
- nshared = ((eax >> 14) & 0xfff) + 1;
+ /*
+ * Iterate over Cache Topology Definition leaves until no
+ * more cache descriptions are available.
+ */
+ for (subleaf = 0; subleaf < 5; subleaf++) {
+ cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
+
+ /* EAX[0:4] gives type of cache */
+ if (!(eax & 0x1f))
+ break;
+
+ prev_eax = eax;
+ }
+ nshared = ((prev_eax >> 14) & 0xfff) + 1;
+
uncore->id = apicid - (apicid % nshared);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
@@ -555,7 +568,7 @@ static int __init amd_uncore_init(void)
ret = 0;
}
- if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
amd_uncore_llc = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_llc) {
ret = -ENOMEM;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index af12e294caed..80534d3c2480 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event)
return m == b;
}
-int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_max_precise(void)
{
- if (event->attr.precise_ip) {
- int precise = 0;
+ int precise = 0;
- /* Support for constant skid */
- if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ /* Support for constant skid */
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
- precise++;
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
+ }
+ return precise;
+}
- if (x86_pmu.pebs_prec_dist)
- precise++;
- }
+int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = x86_pmu_max_precise();
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
}
static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
static int __init init_hw_perf_events(void)
{
@@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+ if (x86_pmu.caps_attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_caps_group.attrs = tmp;
+ }
+
if (x86_pmu.event_attrs)
x86_pmu_events_group.attrs = x86_pmu.event_attrs;
@@ -2213,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = {
.attrs = x86_pmu_attrs,
};
+static ssize_t max_precise_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+ &dev_attr_max_precise.attr,
+ NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+ .name = "caps",
+ .attrs = x86_pmu_caps_attrs,
+};
+
static const struct attribute_group *x86_pmu_attr_groups[] = {
&x86_pmu_attr_group,
&x86_pmu_format_group,
&x86_pmu_events_group,
+ &x86_pmu_caps_group,
NULL,
};
@@ -2335,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
- if (idx > LDT_ENTRIES)
- return 0;
-
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->nr_entries)
+ if (!ldt || idx >= ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
@@ -2348,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;
#endif
} else {
- if (idx > GDT_ENTRIES)
+ if (idx >= GDT_ENTRIES)
return 0;
desc = raw_cpu_ptr(gdt_page.gdt) + idx;
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa51814..e9d8520a801a 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index ddd8d3516bfc..16076eb34699 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags)
bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
- event->hw.itrace_started = 1;
+ perf_event_itrace_started(event);
event->hw.state = 0;
__bts_event_start(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 98b0f0729527..829e89cfcee2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_any.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
+ NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
&format_attr_in_tx.attr,
&format_attr_in_tx_cp.attr,
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
- &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
- &format_attr_ldlat.attr, /* PEBS load latency */
- NULL,
+static struct attribute *nhm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ NULL
};
static struct attribute *skl_format_attr[] = {
@@ -3781,6 +3795,36 @@ done:
static DEVICE_ATTR_RW(freeze_on_smi);
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+ &dev_attr_pmu_name.attr,
+ NULL
+};
+
static struct attribute *intel_pmu_attrs[] = {
&dev_attr_freeze_on_smi.attr,
NULL,
@@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void)
unsigned int unused;
struct extra_reg *er;
int version, i;
+ struct attribute **extra_attr = NULL;
+ char *name;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void)
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_CORE_YONAH:
pr_cont("Core events, ");
+ name = "core";
break;
case INTEL_FAM6_CORE2_MEROM:
@@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_core2_event_constraints;
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
pr_cont("Core2 events, ");
+ name = "core2";
break;
case INTEL_FAM6_NEHALEM:
@@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void)
intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;
+ extra_attr = nhm_format_attr;
pr_cont("Nehalem events, ");
+ name = "nehalem";
break;
case INTEL_FAM6_ATOM_PINEVIEW:
@@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
+ name = "bonnell";
break;
case INTEL_FAM6_ATOM_SILVERMONT1:
@@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_slm_extra_regs;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = slm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Silvermont events, ");
+ name = "silvermont";
break;
case INTEL_FAM6_ATOM_GOLDMONT:
@@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = glm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Goldmont events, ");
+ name = "goldmont";
break;
case INTEL_FAM6_ATOM_GEMINI_LAKE:
@@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = glm_events_attrs;
/* Goldmont Plus has 4-wide pipeline */
event_attr_td_total_slots_scale_glm.event_str = "4";
+ extra_attr = slm_format_attr;
pr_cont("Goldmont plus events, ");
+ name = "goldmont_plus";
break;
case INTEL_FAM6_WESTMERE:
@@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void)
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
intel_pmu_pebs_data_source_nhm();
+ extra_attr = nhm_format_attr;
pr_cont("Westmere events, ");
+ name = "westmere";
break;
case INTEL_FAM6_SANDYBRIDGE:
@@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("SandyBridge events, ");
+ name = "sandybridge";
break;
case INTEL_FAM6_IVYBRIDGE:
@@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("IvyBridge events, ");
+ name = "ivybridge";
break;
@@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.lbr_double_abort = true;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Haswell events, ");
+ name = "haswell";
break;
case INTEL_FAM6_BROADWELL_CORE:
@@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.limit_period = bdw_limit_period;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Broadwell events, ");
+ name = "broadwell";
break;
case INTEL_FAM6_XEON_PHI_KNL:
@@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void)
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
+ extra_attr = slm_format_attr;
pr_cont("Knights Landing/Mill events, ");
+ name = "knights-landing";
break;
case INTEL_FAM6_SKYLAKE_MOBILE:
@@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
- skl_format_attr);
- WARN_ON(!x86_pmu.format_attrs);
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_attr = merge_attr(extra_attr, skl_format_attr);
x86_pmu.cpu_events = hsw_events_attrs;
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
+ name = "skylake";
break;
default:
@@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void)
case 1:
x86_pmu.event_constraints = intel_v1_event_constraints;
pr_cont("generic architected perfmon v1, ");
+ name = "generic_arch_v1";
break;
default:
/*
@@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void)
*/
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v2+";
break;
}
}
+ snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+ if (version >= 2 && extra_attr) {
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ extra_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ }
+
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
- if (x86_pmu.lbr_nr)
+ x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+ if (x86_pmu.lbr_nr) {
+ x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+ }
+
/*
* Access extra MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support offcore event
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 2521f771f2f5..000000000000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include <linux/perf_event.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include <asm/intel_rdt_common.h>
-#include "../perf_event.h"
-
-#define MSR_IA32_QM_CTR 0x0c8e
-#define MSR_IA32_QM_EVTSEL 0x0c8d
-
-#define MBM_CNTR_WIDTH 24
-/*
- * Guaranteed time in ms as per SDM where MBM counters will not overflow.
- */
-#define MBM_CTR_OVERFLOW_TIME 1000
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-static bool cqm_enabled, mbm_enabled;
-unsigned int mbm_socket_max;
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-static struct hrtimer *mbm_timers;
-/**
- * struct sample - mbm event's (local or total) data
- * @total_bytes #bytes since we began monitoring
- * @prev_msr previous value of MSR
- */
-struct sample {
- u64 total_bytes;
- u64 prev_msr;
-};
-
-/*
- * samples profiled for total memory bandwidth type events
- */
-static struct sample *mbm_total;
-/*
- * samples profiled for local memory bandwidth type events
- */
-static struct sample *mbm_local;
-
-#define pkg_id topology_physical_package_id(smp_processor_id())
-/*
- * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
- * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
- * rmids per socket, an example is given below
- * RMID1 of Socket0: vrmid = 1
- * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
- * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
- */
-#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
-/*
- * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
- * Also protects event->hw.cqm_rmid
- *
- * Hold either for stability, both for modification of ->hw.cqm_rmid.
- */
-static DEFINE_MUTEX(cache_mutex);
-static DEFINE_RAW_SPINLOCK(cache_lock);
-
-/*
- * Groups of events that have the same target(s), one RMID per group.
- */
-static LIST_HEAD(cache_groups);
-
-/*
- * Mask of CPUs for reading CQM values. We only need one per-socket.
- */
-static cpumask_t cqm_cpumask;
-
-#define RMID_VAL_ERROR (1ULL << 63)
-#define RMID_VAL_UNAVAIL (1ULL << 62)
-
-/*
- * Event IDs are used to program IA32_QM_EVTSEL before reading event
- * counter from IA32_QM_CTR
- */
-#define QOS_L3_OCCUP_EVENT_ID 0x01
-#define QOS_MBM_TOTAL_EVENT_ID 0x02
-#define QOS_MBM_LOCAL_EVENT_ID 0x03
-
-/*
- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
- *
- * This rmid is always free and is guaranteed to have an associated
- * near-zero occupancy value, i.e. no cachelines are tagged with this
- * RMID, once __intel_cqm_rmid_rotate() returns.
- */
-static u32 intel_cqm_rotation_rmid;
-
-#define INVALID_RMID (-1)
-
-/*
- * Is @rmid valid for programming the hardware?
- *
- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
- * means that we should never come across an rmid with that value.
- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
- * assigned" and is used as part of the rotation code.
- */
-static inline bool __rmid_valid(u32 rmid)
-{
- if (!rmid || rmid == INVALID_RMID)
- return false;
-
- return true;
-}
-
-static u64 __rmid_read(u32 rmid)
-{
- u64 val;
-
- /*
- * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
- * it just says that to increase confusion.
- */
- wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
- rdmsrl(MSR_IA32_QM_CTR, val);
-
- /*
- * Aside from the ERROR and UNAVAIL bits, assume this thing returns
- * the number of cachelines tagged with @rmid.
- */
- return val;
-}
-
-enum rmid_recycle_state {
- RMID_YOUNG = 0,
- RMID_AVAILABLE,
- RMID_DIRTY,
-};
-
-struct cqm_rmid_entry {
- u32 rmid;
- enum rmid_recycle_state state;
- struct list_head list;
- unsigned long queue_time;
-};
-
-/*
- * cqm_rmid_free_lru - A least recently used list of RMIDs.
- *
- * Oldest entry at the head, newest (most recently used) entry at the
- * tail. This list is never traversed, it's only used to keep track of
- * the lru order. That is, we only pick entries of the head or insert
- * them on the tail.
- *
- * All entries on the list are 'free', and their RMIDs are not currently
- * in use. To mark an RMID as in use, remove its entry from the lru
- * list.
- *
- *
- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
- *
- * This list is contains RMIDs that no one is currently using but that
- * may have a non-zero occupancy value associated with them. The
- * rotation worker moves RMIDs from the limbo list to the free list once
- * the occupancy value drops below __intel_cqm_threshold.
- *
- * Both lists are protected by cache_mutex.
- */
-static LIST_HEAD(cqm_rmid_free_lru);
-static LIST_HEAD(cqm_rmid_limbo_lru);
-
-/*
- * We use a simple array of pointers so that we can lookup a struct
- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
- * and __put_rmid() from having to worry about dealing with struct
- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
- *
- * Once this array is initialized it is read-only. No locks are required
- * to access it.
- *
- * All entries for all RMIDs can be looked up in the this array at all
- * times.
- */
-static struct cqm_rmid_entry **cqm_rmid_ptrs;
-
-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
-{
- struct cqm_rmid_entry *entry;
-
- entry = cqm_rmid_ptrs[rmid];
- WARN_ON(entry->rmid != rmid);
-
- return entry;
-}
-
-/*
- * Returns < 0 on fail.
- *
- * We expect to be called with cache_mutex held.
- */
-static u32 __get_rmid(void)
-{
- struct cqm_rmid_entry *entry;
-
- lockdep_assert_held(&cache_mutex);
-
- if (list_empty(&cqm_rmid_free_lru))
- return INVALID_RMID;
-
- entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
- list_del(&entry->list);
-
- return entry->rmid;
-}
-
-static void __put_rmid(u32 rmid)
-{
- struct cqm_rmid_entry *entry;
-
- lockdep_assert_held(&cache_mutex);
-
- WARN_ON(!__rmid_valid(rmid));
- entry = __rmid_entry(rmid);
-
- entry->queue_time = jiffies;
- entry->state = RMID_YOUNG;
-
- list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
-}
-
-static void cqm_cleanup(void)
-{
- int i;
-
- if (!cqm_rmid_ptrs)
- return;
-
- for (i = 0; i < cqm_max_rmid; i++)
- kfree(cqm_rmid_ptrs[i]);
-
- kfree(cqm_rmid_ptrs);
- cqm_rmid_ptrs = NULL;
- cqm_enabled = false;
-}
-
-static int intel_cqm_setup_rmid_cache(void)
-{
- struct cqm_rmid_entry *entry;
- unsigned int nr_rmids;
- int r = 0;
-
- nr_rmids = cqm_max_rmid + 1;
- cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
- nr_rmids, GFP_KERNEL);
- if (!cqm_rmid_ptrs)
- return -ENOMEM;
-
- for (; r <= cqm_max_rmid; r++) {
- struct cqm_rmid_entry *entry;
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto fail;
-
- INIT_LIST_HEAD(&entry->list);
- entry->rmid = r;
- cqm_rmid_ptrs[r] = entry;
-
- list_add_tail(&entry->list, &cqm_rmid_free_lru);
- }
-
- /*
- * RMID 0 is special and is always allocated. It's used for all
- * tasks that are not monitored.
- */
- entry = __rmid_entry(0);
- list_del(&entry->list);
-
- mutex_lock(&cache_mutex);
- intel_cqm_rotation_rmid = __get_rmid();
- mutex_unlock(&cache_mutex);
-
- return 0;
-
-fail:
- cqm_cleanup();
- return -ENOMEM;
-}
-
-/*
- * Determine if @a and @b measure the same set of tasks.
- *
- * If @a and @b measure the same set of tasks then we want to share a
- * single RMID.
- */
-static bool __match_event(struct perf_event *a, struct perf_event *b)
-{
- /* Per-cpu and task events don't mix */
- if ((a->attach_state & PERF_ATTACH_TASK) !=
- (b->attach_state & PERF_ATTACH_TASK))
- return false;
-
-#ifdef CONFIG_CGROUP_PERF
- if (a->cgrp != b->cgrp)
- return false;
-#endif
-
- /* If not task event, we're machine wide */
- if (!(b->attach_state & PERF_ATTACH_TASK))
- return true;
-
- /*
- * Events that target same task are placed into the same cache group.
- * Mark it as a multi event group, so that we update ->count
- * for every event rather than just the group leader later.
- */
- if (a->hw.target == b->hw.target) {
- b->hw.is_group_event = true;
- return true;
- }
-
- /*
- * Are we an inherited event?
- */
- if (b->parent == a)
- return true;
-
- return false;
-}
-
-#ifdef CONFIG_CGROUP_PERF
-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
-{
- if (event->attach_state & PERF_ATTACH_TASK)
- return perf_cgroup_from_task(event->hw.target, event->ctx);
-
- return event->cgrp;
-}
-#endif
-
-/*
- * Determine if @a's tasks intersect with @b's tasks
- *
- * There are combinations of events that we explicitly prohibit,
- *
- * PROHIBITS
- * system-wide -> cgroup and task
- * cgroup -> system-wide
- * -> task in cgroup
- * task -> system-wide
- * -> task in cgroup
- *
- * Call this function before allocating an RMID.
- */
-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
-{
-#ifdef CONFIG_CGROUP_PERF
- /*
- * We can have any number of cgroups but only one system-wide
- * event at a time.
- */
- if (a->cgrp && b->cgrp) {
- struct perf_cgroup *ac = a->cgrp;
- struct perf_cgroup *bc = b->cgrp;
-
- /*
- * This condition should have been caught in
- * __match_event() and we should be sharing an RMID.
- */
- WARN_ON_ONCE(ac == bc);
-
- if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
- cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
- return true;
-
- return false;
- }
-
- if (a->cgrp || b->cgrp) {
- struct perf_cgroup *ac, *bc;
-
- /*
- * cgroup and system-wide events are mutually exclusive
- */
- if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
- (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
- return true;
-
- /*
- * Ensure neither event is part of the other's cgroup
- */
- ac = event_to_cgroup(a);
- bc = event_to_cgroup(b);
- if (ac == bc)
- return true;
-
- /*
- * Must have cgroup and non-intersecting task events.
- */
- if (!ac || !bc)
- return false;
-
- /*
- * We have cgroup and task events, and the task belongs
- * to a cgroup. Check for for overlap.
- */
- if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
- cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
- return true;
-
- return false;
- }
-#endif
- /*
- * If one of them is not a task, same story as above with cgroups.
- */
- if (!(a->attach_state & PERF_ATTACH_TASK) ||
- !(b->attach_state & PERF_ATTACH_TASK))
- return true;
-
- /*
- * Must be non-overlapping.
- */
- return false;
-}
-
-struct rmid_read {
- u32 rmid;
- u32 evt_type;
- atomic64_t value;
-};
-
-static void __intel_cqm_event_count(void *info);
-static void init_mbm_sample(u32 rmid, u32 evt_type);
-static void __intel_mbm_event_count(void *info);
-
-static bool is_cqm_event(int e)
-{
- return (e == QOS_L3_OCCUP_EVENT_ID);
-}
-
-static bool is_mbm_event(int e)
-{
- return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
-}
-
-static void cqm_mask_call(struct rmid_read *rr)
-{
- if (is_mbm_event(rr->evt_type))
- on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
- else
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
-}
-
-/*
- * Exchange the RMID of a group of events.
- */
-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
-{
- struct perf_event *event;
- struct list_head *head = &group->hw.cqm_group_entry;
- u32 old_rmid = group->hw.cqm_rmid;
-
- lockdep_assert_held(&cache_mutex);
-
- /*
- * If our RMID is being deallocated, perform a read now.
- */
- if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
- struct rmid_read rr = {
- .rmid = old_rmid,
- .evt_type = group->attr.config,
- .value = ATOMIC64_INIT(0),
- };
-
- cqm_mask_call(&rr);
- local64_set(&group->count, atomic64_read(&rr.value));
- }
-
- raw_spin_lock_irq(&cache_lock);
-
- group->hw.cqm_rmid = rmid;
- list_for_each_entry(event, head, hw.cqm_group_entry)
- event->hw.cqm_rmid = rmid;
-
- raw_spin_unlock_irq(&cache_lock);
-
- /*
- * If the allocation is for mbm, init the mbm stats.
- * Need to check if each event in the group is mbm event
- * because there could be multiple type of events in the same group.
- */
- if (__rmid_valid(rmid)) {
- event = group;
- if (is_mbm_event(event->attr.config))
- init_mbm_sample(rmid, event->attr.config);
-
- list_for_each_entry(event, head, hw.cqm_group_entry) {
- if (is_mbm_event(event->attr.config))
- init_mbm_sample(rmid, event->attr.config);
- }
- }
-
- return old_rmid;
-}
-
-/*
- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
- * cachelines are still tagged with RMIDs in limbo, we progressively
- * increment the threshold until we find an RMID in limbo with <=
- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
- * problem where cachelines tagged with an RMID are not steadily being
- * evicted.
- *
- * On successful rotations we decrease the threshold back towards zero.
- *
- * __intel_cqm_max_threshold provides an upper bound on the threshold,
- * and is measured in bytes because it's exposed to userland.
- */
-static unsigned int __intel_cqm_threshold;
-static unsigned int __intel_cqm_max_threshold;
-
-/*
- * Test whether an RMID has a zero occupancy value on this cpu.
- */
-static void intel_cqm_stable(void *arg)
-{
- struct cqm_rmid_entry *entry;
-
- list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
- if (entry->state != RMID_AVAILABLE)
- break;
-
- if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
- entry->state = RMID_DIRTY;
- }
-}
-
-/*
- * If we have group events waiting for an RMID that don't conflict with
- * events already running, assign @rmid.
- */
-static bool intel_cqm_sched_in_event(u32 rmid)
-{
- struct perf_event *leader, *event;
-
- lockdep_assert_held(&cache_mutex);
-
- leader = list_first_entry(&cache_groups, struct perf_event,
- hw.cqm_groups_entry);
- event = leader;
-
- list_for_each_entry_continue(event, &cache_groups,
- hw.cqm_groups_entry) {
- if (__rmid_valid(event->hw.cqm_rmid))
- continue;
-
- if (__conflict_event(event, leader))
- continue;
-
- intel_cqm_xchg_rmid(event, rmid);
- return true;
- }
-
- return false;
-}
-
-/*
- * Initially use this constant for both the limbo queue time and the
- * rotation timer interval, pmu::hrtimer_interval_ms.
- *
- * They don't need to be the same, but the two are related since if you
- * rotate faster than you recycle RMIDs, you may run out of available
- * RMIDs.
- */
-#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
-
-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
-
-/*
- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
- * @nr_available: number of freeable RMIDs on the limbo list
- *
- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
- * cachelines are tagged with those RMIDs. After this we can reuse them
- * and know that the current set of active RMIDs is stable.
- *
- * Return %true or %false depending on whether stabilization needs to be
- * reattempted.
- *
- * If we return %true then @nr_available is updated to indicate the
- * number of RMIDs on the limbo list that have been queued for the
- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
- * are above __intel_cqm_threshold.
- */
-static bool intel_cqm_rmid_stabilize(unsigned int *available)
-{
- struct cqm_rmid_entry *entry, *tmp;
-
- lockdep_assert_held(&cache_mutex);
-
- *available = 0;
- list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
- unsigned long min_queue_time;
- unsigned long now = jiffies;
-
- /*
- * We hold RMIDs placed into limbo for a minimum queue
- * time. Before the minimum queue time has elapsed we do
- * not recycle RMIDs.
- *
- * The reasoning is that until a sufficient time has
- * passed since we stopped using an RMID, any RMID
- * placed onto the limbo list will likely still have
- * data tagged in the cache, which means we'll probably
- * fail to recycle it anyway.
- *
- * We can save ourselves an expensive IPI by skipping
- * any RMIDs that have not been queued for the minimum
- * time.
- */
- min_queue_time = entry->queue_time +
- msecs_to_jiffies(__rmid_queue_time_ms);
-
- if (time_after(min_queue_time, now))
- break;
-
- entry->state = RMID_AVAILABLE;
- (*available)++;
- }
-
- /*
- * Fast return if none of the RMIDs on the limbo list have been
- * sitting on the queue for the minimum queue time.
- */
- if (!*available)
- return false;
-
- /*
- * Test whether an RMID is free for each package.
- */
- on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
-
- list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
- /*
- * Exhausted all RMIDs that have waited min queue time.
- */
- if (entry->state == RMID_YOUNG)
- break;
-
- if (entry->state == RMID_DIRTY)
- continue;
-
- list_del(&entry->list); /* remove from limbo */
-
- /*
- * The rotation RMID gets priority if it's
- * currently invalid. In which case, skip adding
- * the RMID to the the free lru.
- */
- if (!__rmid_valid(intel_cqm_rotation_rmid)) {
- intel_cqm_rotation_rmid = entry->rmid;
- continue;
- }
-
- /*
- * If we have groups waiting for RMIDs, hand
- * them one now provided they don't conflict.
- */
- if (intel_cqm_sched_in_event(entry->rmid))
- continue;
-
- /*
- * Otherwise place it onto the free list.
- */
- list_add_tail(&entry->list, &cqm_rmid_free_lru);
- }
-
-
- return __rmid_valid(intel_cqm_rotation_rmid);
-}
-
-/*
- * Pick a victim group and move it to the tail of the group list.
- * @next: The first group without an RMID
- */
-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
-{
- struct perf_event *rotor;
- u32 rmid;
-
- lockdep_assert_held(&cache_mutex);
-
- rotor = list_first_entry(&cache_groups, struct perf_event,
- hw.cqm_groups_entry);
-
- /*
- * The group at the front of the list should always have a valid
- * RMID. If it doesn't then no groups have RMIDs assigned and we
- * don't need to rotate the list.
- */
- if (next == rotor)
- return;
-
- rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
- __put_rmid(rmid);
-
- list_rotate_left(&cache_groups);
-}
-
-/*
- * Deallocate the RMIDs from any events that conflict with @event, and
- * place them on the back of the group list.
- */
-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
-{
- struct perf_event *group, *g;
- u32 rmid;
-
- lockdep_assert_held(&cache_mutex);
-
- list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
- if (group == event)
- continue;
-
- rmid = group->hw.cqm_rmid;
-
- /*
- * Skip events that don't have a valid RMID.
- */
- if (!__rmid_valid(rmid))
- continue;
-
- /*
- * No conflict? No problem! Leave the event alone.
- */
- if (!__conflict_event(group, event))
- continue;
-
- intel_cqm_xchg_rmid(group, INVALID_RMID);
- __put_rmid(rmid);
- }
-}
-
-/*
- * Attempt to rotate the groups and assign new RMIDs.
- *
- * We rotate for two reasons,
- * 1. To handle the scheduling of conflicting events
- * 2. To recycle RMIDs
- *
- * Rotating RMIDs is complicated because the hardware doesn't give us
- * any clues.
- *
- * There's problems with the hardware interface; when you change the
- * task:RMID map cachelines retain their 'old' tags, giving a skewed
- * picture. In order to work around this, we must always keep one free
- * RMID - intel_cqm_rotation_rmid.
- *
- * Rotation works by taking away an RMID from a group (the old RMID),
- * and assigning the free RMID to another group (the new RMID). We must
- * then wait for the old RMID to not be used (no cachelines tagged).
- * This ensure that all cachelines are tagged with 'active' RMIDs. At
- * this point we can start reading values for the new RMID and treat the
- * old RMID as the free RMID for the next rotation.
- *
- * Return %true or %false depending on whether we did any rotating.
- */
-static bool __intel_cqm_rmid_rotate(void)
-{
- struct perf_event *group, *start = NULL;
- unsigned int threshold_limit;
- unsigned int nr_needed = 0;
- unsigned int nr_available;
- bool rotated = false;
-
- mutex_lock(&cache_mutex);
-
-again:
- /*
- * Fast path through this function if there are no groups and no
- * RMIDs that need cleaning.
- */
- if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
- goto out;
-
- list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
- if (!__rmid_valid(group->hw.cqm_rmid)) {
- if (!start)
- start = group;
- nr_needed++;
- }
- }
-
- /*
- * We have some event groups, but they all have RMIDs assigned
- * and no RMIDs need cleaning.
- */
- if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
- goto out;
-
- if (!nr_needed)
- goto stabilize;
-
- /*
- * We have more event groups without RMIDs than available RMIDs,
- * or we have event groups that conflict with the ones currently
- * scheduled.
- *
- * We force deallocate the rmid of the group at the head of
- * cache_groups. The first event group without an RMID then gets
- * assigned intel_cqm_rotation_rmid. This ensures we always make
- * forward progress.
- *
- * Rotate the cache_groups list so the previous head is now the
- * tail.
- */
- __intel_cqm_pick_and_rotate(start);
-
- /*
- * If the rotation is going to succeed, reduce the threshold so
- * that we don't needlessly reuse dirty RMIDs.
- */
- if (__rmid_valid(intel_cqm_rotation_rmid)) {
- intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
- intel_cqm_rotation_rmid = __get_rmid();
-
- intel_cqm_sched_out_conflicting_events(start);
-
- if (__intel_cqm_threshold)
- __intel_cqm_threshold--;
- }
-
- rotated = true;
-
-stabilize:
- /*
- * We now need to stablize the RMID we freed above (if any) to
- * ensure that the next time we rotate we have an RMID with zero
- * occupancy value.
- *
- * Alternatively, if we didn't need to perform any rotation,
- * we'll have a bunch of RMIDs in limbo that need stabilizing.
- */
- threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
-
- while (intel_cqm_rmid_stabilize(&nr_available) &&
- __intel_cqm_threshold < threshold_limit) {
- unsigned int steal_limit;
-
- /*
- * Don't spin if nobody is actively waiting for an RMID,
- * the rotation worker will be kicked as soon as an
- * event needs an RMID anyway.
- */
- if (!nr_needed)
- break;
-
- /* Allow max 25% of RMIDs to be in limbo. */
- steal_limit = (cqm_max_rmid + 1) / 4;
-
- /*
- * We failed to stabilize any RMIDs so our rotation
- * logic is now stuck. In order to make forward progress
- * we have a few options:
- *
- * 1. rotate ("steal") another RMID
- * 2. increase the threshold
- * 3. do nothing
- *
- * We do both of 1. and 2. until we hit the steal limit.
- *
- * The steal limit prevents all RMIDs ending up on the
- * limbo list. This can happen if every RMID has a
- * non-zero occupancy above threshold_limit, and the
- * occupancy values aren't dropping fast enough.
- *
- * Note that there is prioritisation at work here - we'd
- * rather increase the number of RMIDs on the limbo list
- * than increase the threshold, because increasing the
- * threshold skews the event data (because we reuse
- * dirty RMIDs) - threshold bumps are a last resort.
- */
- if (nr_available < steal_limit)
- goto again;
-
- __intel_cqm_threshold++;
- }
-
-out:
- mutex_unlock(&cache_mutex);
- return rotated;
-}
-
-static void intel_cqm_rmid_rotate(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
-
-static struct pmu intel_cqm_pmu;
-
-static void intel_cqm_rmid_rotate(struct work_struct *work)
-{
- unsigned long delay;
-
- __intel_cqm_rmid_rotate();
-
- delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
- schedule_delayed_work(&intel_cqm_rmid_work, delay);
-}
-
-static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
-{
- struct sample *mbm_current;
- u32 vrmid = rmid_2_index(rmid);
- u64 val, bytes, shift;
- u32 eventid;
-
- if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
- mbm_current = &mbm_local[vrmid];
- eventid = QOS_MBM_LOCAL_EVENT_ID;
- } else {
- mbm_current = &mbm_total[vrmid];
- eventid = QOS_MBM_TOTAL_EVENT_ID;
- }
-
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, val);
- if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return mbm_current->total_bytes;
-
- if (first) {
- mbm_current->prev_msr = val;
- mbm_current->total_bytes = 0;
- return mbm_current->total_bytes;
- }
-
- /*
- * The h/w guarantees that counters will not overflow
- * so long as we poll them at least once per second.
- */
- shift = 64 - MBM_CNTR_WIDTH;
- bytes = (val << shift) - (mbm_current->prev_msr << shift);
- bytes >>= shift;
-
- bytes *= cqm_l3_scale;
-
- mbm_current->total_bytes += bytes;
- mbm_current->prev_msr = val;
-
- return mbm_current->total_bytes;
-}
-
-static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
-{
- return update_sample(rmid, evt_type, 0);
-}
-
-static void __intel_mbm_event_init(void *info)
-{
- struct rmid_read *rr = info;
-
- update_sample(rr->rmid, rr->evt_type, 1);
-}
-
-static void init_mbm_sample(u32 rmid, u32 evt_type)
-{
- struct rmid_read rr = {
- .rmid = rmid,
- .evt_type = evt_type,
- .value = ATOMIC64_INIT(0),
- };
-
- /* on each socket, init sample */
- on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
-}
-
-/*
- * Find a group and setup RMID.
- *
- * If we're part of a group, we use the group's RMID.
- */
-static void intel_cqm_setup_event(struct perf_event *event,
- struct perf_event **group)
-{
- struct perf_event *iter;
- bool conflict = false;
- u32 rmid;
-
- event->hw.is_group_event = false;
- list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
- rmid = iter->hw.cqm_rmid;
-
- if (__match_event(iter, event)) {
- /* All tasks in a group share an RMID */
- event->hw.cqm_rmid = rmid;
- *group = iter;
- if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
- init_mbm_sample(rmid, event->attr.config);
- return;
- }
-
- /*
- * We only care about conflicts for events that are
- * actually scheduled in (and hence have a valid RMID).
- */
- if (__conflict_event(iter, event) && __rmid_valid(rmid))
- conflict = true;
- }
-
- if (conflict)
- rmid = INVALID_RMID;
- else
- rmid = __get_rmid();
-
- if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
- init_mbm_sample(rmid, event->attr.config);
-
- event->hw.cqm_rmid = rmid;
-}
-
-static void intel_cqm_event_read(struct perf_event *event)
-{
- unsigned long flags;
- u32 rmid;
- u64 val;
-
- /*
- * Task events are handled by intel_cqm_event_count().
- */
- if (event->cpu == -1)
- return;
-
- raw_spin_lock_irqsave(&cache_lock, flags);
- rmid = event->hw.cqm_rmid;
-
- if (!__rmid_valid(rmid))
- goto out;
-
- if (is_mbm_event(event->attr.config))
- val = rmid_read_mbm(rmid, event->attr.config);
- else
- val = __rmid_read(rmid);
-
- /*
- * Ignore this reading on error states and do not update the value.
- */
- if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- goto out;
-
- local64_set(&event->count, val);
-out:
- raw_spin_unlock_irqrestore(&cache_lock, flags);
-}
-
-static void __intel_cqm_event_count(void *info)
-{
- struct rmid_read *rr = info;
- u64 val;
-
- val = __rmid_read(rr->rmid);
-
- if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return;
-
- atomic64_add(val, &rr->value);
-}
-
-static inline bool cqm_group_leader(struct perf_event *event)
-{
- return !list_empty(&event->hw.cqm_groups_entry);
-}
-
-static void __intel_mbm_event_count(void *info)
-{
- struct rmid_read *rr = info;
- u64 val;
-
- val = rmid_read_mbm(rr->rmid, rr->evt_type);
- if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return;
- atomic64_add(val, &rr->value);
-}
-
-static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
-{
- struct perf_event *iter, *iter1;
- int ret = HRTIMER_RESTART;
- struct list_head *head;
- unsigned long flags;
- u32 grp_rmid;
-
- /*
- * Need to cache_lock as the timer Event Select MSR reads
- * can race with the mbm/cqm count() and mbm_init() reads.
- */
- raw_spin_lock_irqsave(&cache_lock, flags);
-
- if (list_empty(&cache_groups)) {
- ret = HRTIMER_NORESTART;
- goto out;
- }
-
- list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
- grp_rmid = iter->hw.cqm_rmid;
- if (!__rmid_valid(grp_rmid))
- continue;
- if (is_mbm_event(iter->attr.config))
- update_sample(grp_rmid, iter->attr.config, 0);
-
- head = &iter->hw.cqm_group_entry;
- if (list_empty(head))
- continue;
- list_for_each_entry(iter1, head, hw.cqm_group_entry) {
- if (!iter1->hw.is_group_event)
- break;
- if (is_mbm_event(iter1->attr.config))
- update_sample(iter1->hw.cqm_rmid,
- iter1->attr.config, 0);
- }
- }
-
- hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
-out:
- raw_spin_unlock_irqrestore(&cache_lock, flags);
-
- return ret;
-}
-
-static void __mbm_start_timer(void *info)
-{
- hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
- HRTIMER_MODE_REL_PINNED);
-}
-
-static void __mbm_stop_timer(void *info)
-{
- hrtimer_cancel(&mbm_timers[pkg_id]);
-}
-
-static void mbm_start_timers(void)
-{
- on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
-}
-
-static void mbm_stop_timers(void)
-{
- on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
-}
-
-static void mbm_hrtimer_init(void)
-{
- struct hrtimer *hr;
- int i;
-
- for (i = 0; i < mbm_socket_max; i++) {
- hr = &mbm_timers[i];
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hr->function = mbm_hrtimer_handle;
- }
-}
-
-static u64 intel_cqm_event_count(struct perf_event *event)
-{
- unsigned long flags;
- struct rmid_read rr = {
- .evt_type = event->attr.config,
- .value = ATOMIC64_INIT(0),
- };
-
- /*
- * We only need to worry about task events. System-wide events
- * are handled like usual, i.e. entirely with
- * intel_cqm_event_read().
- */
- if (event->cpu != -1)
- return __perf_event_count(event);
-
- /*
- * Only the group leader gets to report values except in case of
- * multiple events in the same group, we still need to read the
- * other events.This stops us
- * reporting duplicate values to userspace, and gives us a clear
- * rule for which task gets to report the values.
- *
- * Note that it is impossible to attribute these values to
- * specific packages - we forfeit that ability when we create
- * task events.
- */
- if (!cqm_group_leader(event) && !event->hw.is_group_event)
- return 0;
-
- /*
- * Getting up-to-date values requires an SMP IPI which is not
- * possible if we're being called in interrupt context. Return
- * the cached values instead.
- */
- if (unlikely(in_interrupt()))
- goto out;
-
- /*
- * Notice that we don't perform the reading of an RMID
- * atomically, because we can't hold a spin lock across the
- * IPIs.
- *
- * Speculatively perform the read, since @event might be
- * assigned a different (possibly invalid) RMID while we're
- * busying performing the IPI calls. It's therefore necessary to
- * check @event's RMID afterwards, and if it has changed,
- * discard the result of the read.
- */
- rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
-
- if (!__rmid_valid(rr.rmid))
- goto out;
-
- cqm_mask_call(&rr);
-
- raw_spin_lock_irqsave(&cache_lock, flags);
- if (event->hw.cqm_rmid == rr.rmid)
- local64_set(&event->count, atomic64_read(&rr.value));
- raw_spin_unlock_irqrestore(&cache_lock, flags);
-out:
- return __perf_event_count(event);
-}
-
-static void intel_cqm_event_start(struct perf_event *event, int mode)
-{
- struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
- u32 rmid = event->hw.cqm_rmid;
-
- if (!(event->hw.cqm_state & PERF_HES_STOPPED))
- return;
-
- event->hw.cqm_state &= ~PERF_HES_STOPPED;
-
- if (state->rmid_usecnt++) {
- if (!WARN_ON_ONCE(state->rmid != rmid))
- return;
- } else {
- WARN_ON_ONCE(state->rmid);
- }
-
- state->rmid = rmid;
- wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
-}
-
-static void intel_cqm_event_stop(struct perf_event *event, int mode)
-{
- struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-
- if (event->hw.cqm_state & PERF_HES_STOPPED)
- return;
-
- event->hw.cqm_state |= PERF_HES_STOPPED;
-
- intel_cqm_event_read(event);
-
- if (!--state->rmid_usecnt) {
- state->rmid = 0;
- wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
- } else {
- WARN_ON_ONCE(!state->rmid);
- }
-}
-
-static int intel_cqm_event_add(struct perf_event *event, int mode)
-{
- unsigned long flags;
- u32 rmid;
-
- raw_spin_lock_irqsave(&cache_lock, flags);
-
- event->hw.cqm_state = PERF_HES_STOPPED;
- rmid = event->hw.cqm_rmid;
-
- if (__rmid_valid(rmid) && (mode & PERF_EF_START))
- intel_cqm_event_start(event, mode);
-
- raw_spin_unlock_irqrestore(&cache_lock, flags);
-
- return 0;
-}
-
-static void intel_cqm_event_destroy(struct perf_event *event)
-{
- struct perf_event *group_other = NULL;
- unsigned long flags;
-
- mutex_lock(&cache_mutex);
- /*
- * Hold the cache_lock as mbm timer handlers could be
- * scanning the list of events.
- */
- raw_spin_lock_irqsave(&cache_lock, flags);
-
- /*
- * If there's another event in this group...
- */
- if (!list_empty(&event->hw.cqm_group_entry)) {
- group_other = list_first_entry(&event->hw.cqm_group_entry,
- struct perf_event,
- hw.cqm_group_entry);
- list_del(&event->hw.cqm_group_entry);
- }
-
- /*
- * And we're the group leader..
- */
- if (cqm_group_leader(event)) {
- /*
- * If there was a group_other, make that leader, otherwise
- * destroy the group and return the RMID.
- */
- if (group_other) {
- list_replace(&event->hw.cqm_groups_entry,
- &group_other->hw.cqm_groups_entry);
- } else {
- u32 rmid = event->hw.cqm_rmid;
-
- if (__rmid_valid(rmid))
- __put_rmid(rmid);
- list_del(&event->hw.cqm_groups_entry);
- }
- }
-
- raw_spin_unlock_irqrestore(&cache_lock, flags);
-
- /*
- * Stop the mbm overflow timers when the last event is destroyed.
- */
- if (mbm_enabled && list_empty(&cache_groups))
- mbm_stop_timers();
-
- mutex_unlock(&cache_mutex);
-}
-
-static int intel_cqm_event_init(struct perf_event *event)
-{
- struct perf_event *group = NULL;
- bool rotate = false;
- unsigned long flags;
-
- if (event->attr.type != intel_cqm_pmu.type)
- return -ENOENT;
-
- if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
- (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
- return -EINVAL;
-
- if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
- (is_mbm_event(event->attr.config) && !mbm_enabled))
- return -EINVAL;
-
- /* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
- return -EINVAL;
-
- INIT_LIST_HEAD(&event->hw.cqm_group_entry);
- INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
-
- event->destroy = intel_cqm_event_destroy;
-
- mutex_lock(&cache_mutex);
-
- /*
- * Start the mbm overflow timers when the first event is created.
- */
- if (mbm_enabled && list_empty(&cache_groups))
- mbm_start_timers();
-
- /* Will also set rmid */
- intel_cqm_setup_event(event, &group);
-
- /*
- * Hold the cache_lock as mbm timer handlers be
- * scanning the list of events.
- */
- raw_spin_lock_irqsave(&cache_lock, flags);
-
- if (group) {
- list_add_tail(&event->hw.cqm_group_entry,
- &group->hw.cqm_group_entry);
- } else {
- list_add_tail(&event->hw.cqm_groups_entry,
- &cache_groups);
-
- /*
- * All RMIDs are either in use or have recently been
- * used. Kick the rotation worker to clean/free some.
- *
- * We only do this for the group leader, rather than for
- * every event in a group to save on needless work.
- */
- if (!__rmid_valid(event->hw.cqm_rmid))
- rotate = true;
- }
-
- raw_spin_unlock_irqrestore(&cache_lock, flags);
- mutex_unlock(&cache_mutex);
-
- if (rotate)
- schedule_delayed_work(&intel_cqm_rmid_work, 0);
-
- return 0;
-}
-
-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-
-EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
-EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
-EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
-EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
-
-EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
-EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
-EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
-EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
-
-static struct attribute *intel_cqm_events_attr[] = {
- EVENT_PTR(intel_cqm_llc),
- EVENT_PTR(intel_cqm_llc_pkg),
- EVENT_PTR(intel_cqm_llc_unit),
- EVENT_PTR(intel_cqm_llc_scale),
- EVENT_PTR(intel_cqm_llc_snapshot),
- NULL,
-};
-
-static struct attribute *intel_mbm_events_attr[] = {
- EVENT_PTR(intel_cqm_total_bytes),
- EVENT_PTR(intel_cqm_local_bytes),
- EVENT_PTR(intel_cqm_total_bytes_pkg),
- EVENT_PTR(intel_cqm_local_bytes_pkg),
- EVENT_PTR(intel_cqm_total_bytes_unit),
- EVENT_PTR(intel_cqm_local_bytes_unit),
- EVENT_PTR(intel_cqm_total_bytes_scale),
- EVENT_PTR(intel_cqm_local_bytes_scale),
- NULL,
-};
-
-static struct attribute *intel_cmt_mbm_events_attr[] = {
- EVENT_PTR(intel_cqm_llc),
- EVENT_PTR(intel_cqm_total_bytes),
- EVENT_PTR(intel_cqm_local_bytes),
- EVENT_PTR(intel_cqm_llc_pkg),
- EVENT_PTR(intel_cqm_total_bytes_pkg),
- EVENT_PTR(intel_cqm_local_bytes_pkg),
- EVENT_PTR(intel_cqm_llc_unit),
- EVENT_PTR(intel_cqm_total_bytes_unit),
- EVENT_PTR(intel_cqm_local_bytes_unit),
- EVENT_PTR(intel_cqm_llc_scale),
- EVENT_PTR(intel_cqm_total_bytes_scale),
- EVENT_PTR(intel_cqm_local_bytes_scale),
- EVENT_PTR(intel_cqm_llc_snapshot),
- NULL,
-};
-
-static struct attribute_group intel_cqm_events_group = {
- .name = "events",
- .attrs = NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7");
-static struct attribute *intel_cqm_formats_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group intel_cqm_format_group = {
- .name = "format",
- .attrs = intel_cqm_formats_attr,
-};
-
-static ssize_t
-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
- char *page)
-{
- ssize_t rv;
-
- mutex_lock(&cache_mutex);
- rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
- mutex_unlock(&cache_mutex);
-
- return rv;
-}
-
-static ssize_t
-max_recycle_threshold_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int bytes, cachelines;
- int ret;
-
- ret = kstrtouint(buf, 0, &bytes);
- if (ret)
- return ret;
-
- mutex_lock(&cache_mutex);
-
- __intel_cqm_max_threshold = bytes;
- cachelines = bytes / cqm_l3_scale;
-
- /*
- * The new maximum takes effect immediately.
- */
- if (__intel_cqm_threshold > cachelines)
- __intel_cqm_threshold = cachelines;
-
- mutex_unlock(&cache_mutex);
-
- return count;
-}
-
-static DEVICE_ATTR_RW(max_recycle_threshold);
-
-static struct attribute *intel_cqm_attrs[] = {
- &dev_attr_max_recycle_threshold.attr,
- NULL,
-};
-
-static const struct attribute_group intel_cqm_group = {
- .attrs = intel_cqm_attrs,
-};
-
-static const struct attribute_group *intel_cqm_attr_groups[] = {
- &intel_cqm_events_group,
- &intel_cqm_format_group,
- &intel_cqm_group,
- NULL,
-};
-
-static struct pmu intel_cqm_pmu = {
- .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
- .attr_groups = intel_cqm_attr_groups,
- .task_ctx_nr = perf_sw_context,
- .event_init = intel_cqm_event_init,
- .add = intel_cqm_event_add,
- .del = intel_cqm_event_stop,
- .start = intel_cqm_event_start,
- .stop = intel_cqm_event_stop,
- .read = intel_cqm_event_read,
- .count = intel_cqm_event_count,
-};
-
-static inline void cqm_pick_event_reader(int cpu)
-{
- int reader;
-
- /* First online cpu in package becomes the reader */
- reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
- if (reader >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cqm_cpumask);
-}
-
-static int intel_cqm_cpu_starting(unsigned int cpu)
-{
- struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- state->rmid = 0;
- state->closid = 0;
- state->rmid_usecnt = 0;
-
- WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
- WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
-
- cqm_pick_event_reader(cpu);
- return 0;
-}
-
-static int intel_cqm_cpu_exit(unsigned int cpu)
-{
- int target;
-
- /* Is @cpu the current cqm reader for this package ? */
- if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
- return 0;
-
- /* Find another online reader in this package */
- target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
-
- if (target < nr_cpu_ids)
- cpumask_set_cpu(target, &cqm_cpumask);
-
- return 0;
-}
-
-static const struct x86_cpu_id intel_cqm_match[] = {
- { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
- {}
-};
-
-static void mbm_cleanup(void)
-{
- if (!mbm_enabled)
- return;
-
- kfree(mbm_local);
- kfree(mbm_total);
- mbm_enabled = false;
-}
-
-static const struct x86_cpu_id intel_mbm_local_match[] = {
- { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
- {}
-};
-
-static const struct x86_cpu_id intel_mbm_total_match[] = {
- { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
- {}
-};
-
-static int intel_mbm_init(void)
-{
- int ret = 0, array_size, maxid = cqm_max_rmid + 1;
-
- mbm_socket_max = topology_max_packages();
- array_size = sizeof(struct sample) * maxid * mbm_socket_max;
- mbm_local = kmalloc(array_size, GFP_KERNEL);
- if (!mbm_local)
- return -ENOMEM;
-
- mbm_total = kmalloc(array_size, GFP_KERNEL);
- if (!mbm_total) {
- ret = -ENOMEM;
- goto out;
- }
-
- array_size = sizeof(struct hrtimer) * mbm_socket_max;
- mbm_timers = kmalloc(array_size, GFP_KERNEL);
- if (!mbm_timers) {
- ret = -ENOMEM;
- goto out;
- }
- mbm_hrtimer_init();
-
-out:
- if (ret)
- mbm_cleanup();
-
- return ret;
-}
-
-static int __init intel_cqm_init(void)
-{
- char *str = NULL, scale[20];
- int cpu, ret;
-
- if (x86_match_cpu(intel_cqm_match))
- cqm_enabled = true;
-
- if (x86_match_cpu(intel_mbm_local_match) &&
- x86_match_cpu(intel_mbm_total_match))
- mbm_enabled = true;
-
- if (!cqm_enabled && !mbm_enabled)
- return -ENODEV;
-
- cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
-
- /*
- * It's possible that not all resources support the same number
- * of RMIDs. Instead of making scheduling much more complicated
- * (where we have to match a task's RMID to a cpu that supports
- * that many RMIDs) just find the minimum RMIDs supported across
- * all cpus.
- *
- * Also, check that the scales match on all cpus.
- */
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->x86_cache_max_rmid < cqm_max_rmid)
- cqm_max_rmid = c->x86_cache_max_rmid;
-
- if (c->x86_cache_occ_scale != cqm_l3_scale) {
- pr_err("Multiple LLC scale values, disabling\n");
- ret = -EINVAL;
- goto out;
- }
- }
-
- /*
- * A reasonable upper limit on the max threshold is the number
- * of lines tagged per RMID if all RMIDs have the same number of
- * lines tagged in the LLC.
- *
- * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
- */
- __intel_cqm_max_threshold =
- boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
-
- snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
- str = kstrdup(scale, GFP_KERNEL);
- if (!str) {
- ret = -ENOMEM;
- goto out;
- }
-
- event_attr_intel_cqm_llc_scale.event_str = str;
-
- ret = intel_cqm_setup_rmid_cache();
- if (ret)
- goto out;
-
- if (mbm_enabled)
- ret = intel_mbm_init();
- if (ret && !cqm_enabled)
- goto out;
-
- if (cqm_enabled && mbm_enabled)
- intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
- else if (!cqm_enabled && mbm_enabled)
- intel_cqm_events_group.attrs = intel_mbm_events_attr;
- else if (cqm_enabled && !mbm_enabled)
- intel_cqm_events_group.attrs = intel_cqm_events_attr;
-
- ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
- if (ret) {
- pr_err("Intel CQM perf registration failed: %d\n", ret);
- goto out;
- }
-
- if (cqm_enabled)
- pr_info("Intel CQM monitoring enabled\n");
- if (mbm_enabled)
- pr_info("Intel MBM enabled\n");
-
- /*
- * Setup the hot cpu notifier once we are sure cqm
- * is enabled to avoid notifier leak.
- */
- cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
- "perf/x86/cqm:starting",
- intel_cqm_cpu_starting, NULL);
- cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
- "perf/x86/cqm:online",
- NULL, intel_cqm_cpu_exit);
-out:
- cpus_read_unlock();
-
- if (ret) {
- kfree(str);
- cqm_cleanup();
- mbm_cleanup();
- }
-
- return ret;
-}
-device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed5f8ed..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
*/
#define P(a, b) PERF_MEM_S(a, b)
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
static u64 pebs_data_source[] = {
- P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
- OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
- OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
- OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
- OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
- OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
- OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
- OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
- OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
- OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
- OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
- OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
};
/* Patch up minor differences in the bits */
void __init intel_pmu_pebs_data_source_nhm(void)
{
- pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
- pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
- pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
}
static u64 precise_store_data(u64 status)
@@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
- int model = boot_cpu_data.x86_model;
- int fam = boot_cpu_data.x86;
dse.val = status;
@@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status)
/*
* Nehalem models do not support TLB, Lock infos
*/
- if (fam == 0x6 && (model == 26 || model == 30
- || model == 31 || model == 46)) {
+ if (x86_pmu.pebs_no_tlb) {
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
@@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
else
regs->flags &= ~PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_ADDR) &&
+ if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 955457a30197..8a6bbacd17dc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
X86_BR_CALL_STACK = 1 << 16,/* call stack */
X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].in_tx = 0;
cpuc->lbr_entries[i].abort = 0;
cpuc->lbr_entries[i].cycles = 0;
+ cpuc->lbr_entries[i].type = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
@@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
cpuc->lbr_entries[out].cycles = cycles;
+ cpuc->lbr_entries[out].type = 0;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
@@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_CALL)
mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return ret;
}
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_UNKNOWN, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_UNKNOWN, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
+
/*
* implement actual branch filter based on user demand.
* Hardware may not exactly satisfy that request, thus
@@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
bool compress = false;
/* if sampling all branches, then nothing to filter */
- if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
return;
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].from = 0;
compress = true;
}
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
}
if (!compress)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index ae8324d65e61..81fd41d5a0d9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event)
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
- if (!event->hw.itrace_started) {
- event->hw.itrace_started = 1;
+ /* First round: clear STATUS, in particular the PSB byte counter. */
+ if (!event->hw.config) {
+ perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3a4cab..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
- PERF_SAMPLE_TRANSACTION)
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
/*
* A debug store configuration.
@@ -558,6 +558,7 @@ struct x86_pmu {
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
+ struct attribute **caps_attrs;
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
@@ -591,7 +592,8 @@ struct x86_pmu {
pebs :1,
pebs_active :1,
pebs_broken :1,
- pebs_prec_dist :1;
+ pebs_prec_dist :1,
+ pebs_no_tlb :1;
int pebs_record_size;
int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
@@ -741,6 +743,8 @@ int x86_reserve_hardware(void);
void x86_release_hardware(void);
+int x86_pmu_max_precise(void);
+
void hw_perf_lbr_event_destroy(struct perf_event *event);
int x86_setup_perfctr(struct perf_event *event);
@@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void);
void intel_pmu_pebs_data_source_nhm(void);
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 724153797209..e0bb46c02857 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
if (ksig->ka.sa.sa_flags & SA_ONSTACK)
sp = sigsp(sp, ksig);
/* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER32_DS &&
+ else if (regs->ss != __USER32_DS &&
!(ksig->ka.sa.sa_flags & SA_RESTORER) &&
ksig->ka.sa.sa_restorer)
sp = (unsigned long) ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
# define _ASM_EXTABLE_EX(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
# define _ASM_NOKPROBE(entry) \
.pushsection "_kprobe_blacklist","aw" ; \
_ASM_ALIGN ; \
@@ -123,6 +126,9 @@
# define _ASM_EXTABLE_EX(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
return xchg(&v->counter, new);
}
-#define ATOMIC_OP(op) \
-static inline void atomic_##op(int i, atomic_t *v) \
-{ \
- asm volatile(LOCK_PREFIX #op"l %1,%0" \
- : "+m" (v->counter) \
- : "ir" (i) \
- : "memory"); \
+static inline void atomic_and(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "andl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int atomic_fetch_and(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
+
+ do { } while (!atomic_try_cmpxchg(v, &val, val & i));
+
+ return val;
}
-#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- int val = atomic_read(v); \
- do { \
- } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \
- return val; \
+static inline void atomic_or(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "orl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
}
-#define ATOMIC_OPS(op, c_op) \
- ATOMIC_OP(op) \
- ATOMIC_FETCH_OP(op, c_op)
+static inline int atomic_fetch_or(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
-ATOMIC_OPS(and, &)
-ATOMIC_OPS(or , |)
-ATOMIC_OPS(xor, ^)
+ do { } while (!atomic_try_cmpxchg(v, &val, val | i));
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
+ return val;
+}
+
+static inline void atomic_xor(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int atomic_fetch_xor(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
+
+ do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
+
+ return val;
+}
/**
* __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
int c = atomic_read(v);
+
do {
if (unlikely(c == u))
break;
} while (!atomic_try_cmpxchg(v, &c, c + a));
+
return c;
}
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
-#define ATOMIC64_OP(op, c_op) \
-static inline void atomic64_##op(long long i, atomic64_t *v) \
-{ \
- long long old, c = 0; \
- while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
- c = old; \
+static inline void atomic64_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
}
-#define ATOMIC64_FETCH_OP(op, c_op) \
-static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \
-{ \
- long long old, c = 0; \
- while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
- c = old; \
- return old; \
+static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
+
+ return old;
}
-ATOMIC64_FETCH_OP(add, +)
+static inline void atomic64_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
-#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
+ while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+}
+
+static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+
+ return old;
+}
-#define ATOMIC64_OPS(op, c_op) \
- ATOMIC64_OP(op, c_op) \
- ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+}
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+
+ return old;
+}
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
+ c = old;
+
+ return old;
+}
+
+#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
}
#define atomic64_try_cmpxchg atomic64_try_cmpxchg
-static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
{
return try_cmpxchg(&v->counter, old, new);
}
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
*/
static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
{
- long c = atomic64_read(v);
+ s64 c = atomic64_read(v);
do {
if (unlikely(c == u))
return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
*/
static inline long atomic64_dec_if_positive(atomic64_t *v)
{
- long dec, c = atomic64_read(v);
+ s64 dec, c = atomic64_read(v);
do {
dec = c - 1;
if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
return dec;
}
-#define ATOMIC64_OP(op) \
-static inline void atomic64_##op(long i, atomic64_t *v) \
-{ \
- asm volatile(LOCK_PREFIX #op"q %1,%0" \
- : "+m" (v->counter) \
- : "er" (i) \
- : "memory"); \
+static inline void atomic64_and(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "andq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
}
-#define ATOMIC64_FETCH_OP(op, c_op) \
-static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
-{ \
- long val = atomic64_read(v); \
- do { \
- } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \
- return val; \
+static inline long atomic64_fetch_and(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
+
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val & i));
+ return val;
}
-#define ATOMIC64_OPS(op, c_op) \
- ATOMIC64_OP(op) \
- ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_or(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "orq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long atomic64_fetch_or(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val | i));
+ return val;
+}
+
+static inline void atomic64_xor(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static inline long atomic64_fetch_xor(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
+
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val ^ i));
+ return val;
+}
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
({ \
bool success; \
- __typeof__(_ptr) _old = (_pold); \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
__typeof__(*(_ptr)) __old = *_old; \
__typeof__(*(_ptr)) __new = (_new); \
switch (size) { \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 66ac08607471..42bbbf0f173d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
/*
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..1a2ba368da39 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -5,6 +5,7 @@
#include <asm/ldt.h>
#include <asm/mmu.h>
#include <asm/fixmap.h>
+#include <asm/irq_vectors.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
desc->s = 1;
desc->dpl = 0x3;
desc->p = info->seg_not_present ^ 1;
- desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->limit1 = (info->limit & 0xf0000) >> 16;
desc->avl = info->useable;
desc->d = info->seg_32bit;
desc->g = info->limit_in_pages;
@@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}
-#ifdef CONFIG_X86_64
-
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
- gate->offset_low = PTR_LOW(func);
+ gate->offset_low = (u16) func;
+ gate->bits.p = 1;
+ gate->bits.dpl = dpl;
+ gate->bits.zero = 0;
+ gate->bits.type = type;
+ gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
gate->segment = __KERNEL_CS;
- gate->ist = ist;
- gate->p = 1;
- gate->dpl = dpl;
- gate->zero0 = 0;
- gate->zero1 = 0;
- gate->type = type;
- gate->offset_middle = PTR_MIDDLE(func);
- gate->offset_high = PTR_HIGH(func);
-}
-
+ gate->bits.ist = ist;
+ gate->reserved = 0;
+ gate->offset_high = (u32) (func >> 32);
#else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
- unsigned long base, unsigned dpl, unsigned flags,
- unsigned short seg)
-{
- gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
-}
-
+ gate->segment = seg;
+ gate->bits.ist = 0;
#endif
+}
static inline int desc_empty(const void *ptr)
{
@@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int
memcpy(&gdt[entry], desc, size);
}
-static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
- unsigned long limit, unsigned char type,
- unsigned char flags)
-{
- desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
- desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
- (limit & 0x000f0000) | ((type & 0xff) << 8) |
- ((flags & 0xf) << 20);
- desc->p = 1;
-}
-
-
-static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
{
-#ifdef CONFIG_X86_64
- struct ldttss_desc64 *desc = d;
+ struct ldttss_desc *desc = d;
memset(desc, 0, sizeof(*desc));
- desc->limit0 = size & 0xFFFF;
- desc->base0 = PTR_LOW(addr);
- desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->limit0 = (u16) size;
+ desc->base0 = (u16) addr;
+ desc->base1 = (addr >> 16) & 0xFF;
desc->type = type;
desc->p = 1;
desc->limit1 = (size >> 16) & 0xF;
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
-#else
- pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+ desc->base2 = (addr >> 24) & 0xFF;
+#ifdef CONFIG_X86_64
+ desc->base3 = (u32) (addr >> 32);
#endif
}
@@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
static inline unsigned long get_desc_limit(const struct desc_struct *desc)
{
- return desc->limit0 | (desc->limit << 16);
+ return desc->limit0 | (desc->limit1 << 16);
}
static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
{
desc->limit0 = limit & 0xffff;
- desc->limit = (limit >> 16) & 0xf;
-}
-
-#ifdef CONFIG_X86_64
-static inline void set_nmi_gate(int gate, void *addr)
-{
- gate_desc s;
-
- pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
- write_idt_entry(debug_idt_table, gate, &s);
+ desc->limit1 = (limit >> 16) & 0xf;
}
-#endif
-#ifdef CONFIG_TRACING
-extern struct desc_ptr trace_idt_descr;
-extern gate_desc trace_idt_table[];
-static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
-{
- write_idt_entry(trace_idt_table, entry, gate);
-}
+void update_intr_gate(unsigned int n, const void *addr);
+void alloc_intr_gate(unsigned int n, const void *addr);
-static inline void _trace_set_gate(int gate, unsigned type, void *addr,
- unsigned dpl, unsigned ist, unsigned seg)
-{
- gate_desc s;
-
- pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
- /*
- * does not need to be atomic because it is only done once at
- * setup time
- */
- write_trace_idt_entry(gate, &s);
-}
-#else
-static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
-{
-}
-
-#define _trace_set_gate(gate, type, addr, dpl, ist, seg)
-#endif
-
-static inline void _set_gate(int gate, unsigned type, void *addr,
- unsigned dpl, unsigned ist, unsigned seg)
-{
- gate_desc s;
-
- pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
- /*
- * does not need to be atomic because it is only done once at
- * setup time
- */
- write_idt_entry(idt_table, gate, &s);
- write_trace_idt_entry(gate, &s);
-}
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-#define set_intr_gate_notrace(n, addr) \
- do { \
- BUG_ON((unsigned)n > 0xFF); \
- _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
- __KERNEL_CS); \
- } while (0)
-
-#define set_intr_gate(n, addr) \
- do { \
- set_intr_gate_notrace(n, addr); \
- _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
- 0, 0, __KERNEL_CS); \
- } while (0)
-
-extern int first_system_vector;
-/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
extern unsigned long used_vectors[];
-static inline void alloc_system_vector(int vector)
-{
- if (!test_bit(vector, used_vectors)) {
- set_bit(vector, used_vectors);
- if (first_system_vector > vector)
- first_system_vector = vector;
- } else {
- BUG();
- }
-}
-
-#define alloc_intr_gate(n, addr) \
- do { \
- alloc_system_vector(n); \
- set_intr_gate(n, addr); \
- } while (0)
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_system_trap_gate(unsigned int n, void *addr)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_trap_gate(unsigned int n, void *addr)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
-}
-
-static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
-}
-
-static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
-}
-
-static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
-}
-
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u32, debug_idt_ctr);
static inline bool is_debug_idt_enabled(void)
@@ -567,31 +420,6 @@ static inline void load_debug_idt(void)
}
#endif
-#ifdef CONFIG_TRACING
-extern atomic_t trace_idt_ctr;
-static inline bool is_trace_idt_enabled(void)
-{
- if (atomic_read(&trace_idt_ctr))
- return true;
-
- return false;
-}
-
-static inline void load_trace_idt(void)
-{
- load_idt((const struct desc_ptr *)&trace_idt_descr);
-}
-#else
-static inline bool is_trace_idt_enabled(void)
-{
- return false;
-}
-
-static inline void load_trace_idt(void)
-{
-}
-#endif
-
/*
* The load_current_idt() must be called with interrupts disabled
* to avoid races. That way the IDT will always be set back to the expected
@@ -603,9 +431,25 @@ static inline void load_current_idt(void)
{
if (is_debug_idt_enabled())
load_debug_idt();
- else if (is_trace_idt_enabled())
- load_trace_idt();
else
load_idt((const struct desc_ptr *)&idt_descr);
}
+
+extern void idt_setup_early_handler(void);
+extern void idt_setup_early_traps(void);
+extern void idt_setup_traps(void);
+extern void idt_setup_apic_and_irq_gates(void);
+
+#ifdef CONFIG_X86_64
+extern void idt_setup_early_pf(void);
+extern void idt_setup_ist_traps(void);
+extern void idt_setup_debugidt_traps(void);
+#else
+static inline void idt_setup_early_pf(void) { }
+static inline void idt_setup_ist_traps(void) { }
+static inline void idt_setup_debugidt_traps(void) { }
+#endif
+
+extern void idt_invalidate(void *addr);
+
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 49265345d4d2..346d252029b7 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -11,34 +11,30 @@
#include <linux/types.h>
-/*
- * FIXME: Accessing the desc_struct through its fields is more elegant,
- * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b accessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than a union,
- * so we can get rid of it transparently in the future -- glommer
- */
/* 8 byte segment descriptor */
struct desc_struct {
- union {
- struct {
- unsigned int a;
- unsigned int b;
- };
- struct {
- u16 limit0;
- u16 base0;
- unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
- unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
- };
- };
+ u16 limit0;
+ u16 base0;
+ u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
} __attribute__((packed));
-#define GDT_ENTRY_INIT(flags, base, limit) { { { \
- .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \
- .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \
- ((limit) & 0xf0000) | ((base) & 0xff000000), \
- } } }
+#define GDT_ENTRY_INIT(flags, base, limit) \
+ { \
+ .limit0 = (u16) (limit), \
+ .limit1 = ((limit) >> 16) & 0x0F, \
+ .base0 = (u16) (base), \
+ .base1 = ((base) >> 16) & 0xFF, \
+ .base2 = ((base) >> 24) & 0xFF, \
+ .type = (flags & 0x0f), \
+ .s = (flags >> 4) & 0x01, \
+ .dpl = (flags >> 5) & 0x03, \
+ .p = (flags >> 7) & 0x01, \
+ .avl = (flags >> 12) & 0x01, \
+ .l = (flags >> 13) & 0x01, \
+ .d = (flags >> 14) & 0x01, \
+ .g = (flags >> 15) & 0x01, \
+ }
enum {
GATE_INTERRUPT = 0xE,
@@ -47,49 +43,63 @@ enum {
GATE_TASK = 0x5,
};
-/* 16byte gate */
-struct gate_struct64 {
- u16 offset_low;
- u16 segment;
- unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
- u16 offset_middle;
- u32 offset_high;
- u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
-
enum {
DESC_TSS = 0x9,
DESC_LDT = 0x2,
DESCTYPE_S = 0x10, /* !system */
};
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct ldttss_desc64 {
- u16 limit0;
- u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
- u32 base3;
- u32 zero1;
+/* LDT or TSS descriptor in the GDT. */
+struct ldttss_desc {
+ u16 limit0;
+ u16 base0;
+
+ u16 base1 : 8, type : 5, dpl : 2, p : 1;
+ u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+#ifdef CONFIG_X86_64
+ u32 base3;
+ u32 zero1;
+#endif
} __attribute__((packed));
+typedef struct ldttss_desc ldt_desc;
+typedef struct ldttss_desc tss_desc;
+
+struct idt_bits {
+ u16 ist : 3,
+ zero : 5,
+ type : 5,
+ dpl : 2,
+ p : 1;
+} __attribute__((packed));
+
+struct gate_struct {
+ u16 offset_low;
+ u16 segment;
+ struct idt_bits bits;
+ u16 offset_middle;
+#ifdef CONFIG_X86_64
+ u32 offset_high;
+ u32 reserved;
+#endif
+} __attribute__((packed));
+
+typedef struct gate_struct gate_desc;
+
+static inline unsigned long gate_offset(const gate_desc *g)
+{
#ifdef CONFIG_X86_64
-typedef struct gate_struct64 gate_desc;
-typedef struct ldttss_desc64 ldt_desc;
-typedef struct ldttss_desc64 tss_desc;
-#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
-#define gate_segment(g) ((g).segment)
+ return g->offset_low | ((unsigned long)g->offset_middle << 16) |
+ ((unsigned long) g->offset_high << 32);
#else
-typedef struct desc_struct gate_desc;
-typedef struct desc_struct ldt_desc;
-typedef struct desc_struct tss_desc;
-#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
-#define gate_segment(g) ((g).a >> 16)
+ return g->offset_low | ((unsigned long)g->offset_middle << 16);
#endif
+}
+
+static inline unsigned long gate_segment(const gate_desc *g)
+{
+ return g->segment;
+}
struct desc_ptr {
unsigned short size;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index a3de31ffb722..04330c8d9af9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -126,15 +126,15 @@ do { \
pr_reg[4] = regs->di; \
pr_reg[5] = regs->bp; \
pr_reg[6] = regs->ax; \
- pr_reg[7] = regs->ds & 0xffff; \
- pr_reg[8] = regs->es & 0xffff; \
- pr_reg[9] = regs->fs & 0xffff; \
+ pr_reg[7] = regs->ds; \
+ pr_reg[8] = regs->es; \
+ pr_reg[9] = regs->fs; \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
- pr_reg[13] = regs->cs & 0xffff; \
+ pr_reg[13] = regs->cs; \
pr_reg[14] = regs->flags; \
pr_reg[15] = regs->sp; \
- pr_reg[16] = regs->ss & 0xffff; \
+ pr_reg[16] = regs->ss; \
} while (0);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
+ unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -226,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- (pr_reg)[21] = current->thread.fsbase; \
- (pr_reg)[22] = current->thread.gsbase; \
+ rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
+ rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 07b06955a05d..aa15d1f7e530 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -13,20 +13,14 @@
BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
-BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR,
- smp_irq_move_cleanup_interrupt)
-BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR)
#endif
-BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
-
#ifdef CONFIG_HAVE_KVM
-BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
- smp_kvm_posted_intr_ipi)
-BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
- smp_kvm_posted_intr_wakeup_ipi)
-BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR,
- smp_kvm_posted_intr_nested_ipi)
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)
#endif
/*
@@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR,
BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
#ifdef CONFIG_IRQ_WORK
BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..554cdb205d17 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
return 0;
}
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
if (use_xsave()) {
- copy_kernel_to_xregs(&fpstate->xsave, -1);
+ copy_kernel_to_xregs(&fpstate->xsave, mask);
} else {
if (use_fxsr())
copy_kernel_to_fxregs(&fpstate->fxsave);
@@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
: : [addr] "m" (fpstate));
}
- __copy_kernel_to_fpregs(fpstate);
+ __copy_kernel_to_fpregs(fpstate, -1);
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f5453436..f4dc9b63bdda 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret, tem;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index d6dbafbd4207..6dfe366a8804 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void);
extern asmlinkage void call_function_interrupt(void);
extern asmlinkage void call_function_single_interrupt(void);
-#ifdef CONFIG_TRACING
-/* Interrupt handlers registered during init_IRQ */
-extern void trace_apic_timer_interrupt(void);
-extern void trace_x86_platform_ipi(void);
-extern void trace_error_interrupt(void);
-extern void trace_irq_work_interrupt(void);
-extern void trace_spurious_interrupt(void);
-extern void trace_thermal_interrupt(void);
-extern void trace_reschedule_interrupt(void);
-extern void trace_threshold_interrupt(void);
-extern void trace_deferred_error_interrupt(void);
-extern void trace_call_function_interrupt(void);
-extern void trace_call_function_single_interrupt(void);
-#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
-#define trace_reboot_interrupt reboot_interrupt
-#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
-#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
-#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi
-#endif /* CONFIG_TRACING */
-
#ifdef CONFIG_X86_LOCAL_APIC
struct irq_data;
struct pci_dev;
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
deleted file mode 100644
index 597dc4995678..000000000000
--- a/arch/x86/include/asm/intel_rdt.h
+++ /dev/null
@@ -1,286 +0,0 @@
-#ifndef _ASM_X86_INTEL_RDT_H
-#define _ASM_X86_INTEL_RDT_H
-
-#ifdef CONFIG_INTEL_RDT_A
-
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/jump_label.h>
-
-#include <asm/intel_rdt_common.h>
-
-#define IA32_L3_QOS_CFG 0xc81
-#define IA32_L3_CBM_BASE 0xc90
-#define IA32_L2_CBM_BASE 0xd10
-#define IA32_MBA_THRTL_BASE 0xd50
-
-#define L3_QOS_CDP_ENABLE 0x01ULL
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn: kernfs node
- * @rdtgroup_list: linked list for all rdtgroups
- * @closid: closid for this rdtgroup
- * @cpu_mask: CPUs assigned to this rdtgroup
- * @flags: status bits
- * @waitcount: how many cpus expect to find this
- * group when they acquire rdtgroup_mutex
- */
-struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- int closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
-};
-
-/* rdtgroup.flags */
-#define RDT_DELETED 1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name: File name
- * @mode: Access mode
- * @kf_ops: File operations
- * @flags: File specific RFTYPE_FLAGS_* flags
- * @seq_show: Show content of the file
- * @write: Write to the file
- */
-struct rftype {
- char *name;
- umode_t mode;
- struct kernfs_ops *kf_ops;
- unsigned long flags;
-
- int (*seq_show)(struct kernfs_open_file *of,
- struct seq_file *sf, void *v);
- /*
- * write() is the generic write callback which maps directly to
- * kernfs write operation and overrides all other operations.
- * Maximum write size is determined by ->max_write_len.
- */
- ssize_t (*write)(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct rdt_domain - group of cpus sharing an RDT resource
- * @list: all instances of this resource
- * @id: unique id for this instance
- * @cpu_mask: which cpus share this resource
- * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
- * @new_ctrl: new ctrl value to be loaded
- * @have_new_ctrl: did user provide new_ctrl for this domain
- */
-struct rdt_domain {
- struct list_head list;
- int id;
- struct cpumask cpu_mask;
- u32 *ctrl_val;
- u32 new_ctrl;
- bool have_new_ctrl;
-};
-
-/**
- * struct msr_param - set a range of MSRs from a domain
- * @res: The resource to use
- * @low: Beginning index from base MSR
- * @high: End index
- */
-struct msr_param {
- struct rdt_resource *res;
- int low;
- int high;
-};
-
-/**
- * struct rdt_cache - Cache allocation related data
- * @cbm_len: Length of the cache bit mask
- * @min_cbm_bits: Minimum number of consecutive bits to be set
- * @cbm_idx_mult: Multiplier of CBM index
- * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
- * closid * cbm_idx_multi + cbm_idx_offset
- * in a cache bit mask
- */
-struct rdt_cache {
- unsigned int cbm_len;
- unsigned int min_cbm_bits;
- unsigned int cbm_idx_mult;
- unsigned int cbm_idx_offset;
-};
-
-/**
- * struct rdt_membw - Memory bandwidth allocation related data
- * @max_delay: Max throttle delay. Delay is the hardware
- * representation for memory bandwidth.
- * @min_bw: Minimum memory bandwidth percentage user can request
- * @bw_gran: Granularity at which the memory bandwidth is allocated
- * @delay_linear: True if memory B/W delay is in linear scale
- * @mb_map: Mapping of memory B/W percentage to memory B/W delay
- */
-struct rdt_membw {
- u32 max_delay;
- u32 min_bw;
- u32 bw_gran;
- u32 delay_linear;
- u32 *mb_map;
-};
-
-/**
- * struct rdt_resource - attributes of an RDT resource
- * @enabled: Is this feature enabled on this machine
- * @capable: Is this feature available on this machine
- * @name: Name to use in "schemata" file
- * @num_closid: Number of CLOSIDs available
- * @cache_level: Which cache level defines scope of this resource
- * @default_ctrl: Specifies default cache cbm or memory B/W percent.
- * @msr_base: Base MSR address for CBMs
- * @msr_update: Function pointer to update QOS MSRs
- * @data_width: Character width of data when displaying
- * @domains: All domains for this resource
- * @cache: Cache allocation related data
- * @info_files: resctrl info files for the resource
- * @nr_info_files: Number of info files
- * @format_str: Per resource format string to show domain value
- * @parse_ctrlval: Per resource function pointer to parse control values
- */
-struct rdt_resource {
- bool enabled;
- bool capable;
- char *name;
- int num_closid;
- int cache_level;
- u32 default_ctrl;
- unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
- int data_width;
- struct list_head domains;
- struct rdt_cache cache;
- struct rdt_membw membw;
- struct rftype *info_files;
- int nr_info_files;
- const char *format_str;
- int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
- struct rdt_domain *d);
-};
-
-void rdt_get_cache_infofile(struct rdt_resource *r);
-void rdt_get_mba_infofile(struct rdt_resource *r);
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
-
-extern struct mutex rdtgroup_mutex;
-
-extern struct rdt_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-
-int __init rdtgroup_init(void);
-
-enum {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L3DATA,
- RDT_RESOURCE_L3CODE,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-#define for_each_capable_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
- if (r->capable)
-
-#define for_each_enabled_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
- if (r->enabled)
-
-/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
-union cpuid_0x10_1_eax {
- struct {
- unsigned int cbm_len:5;
- } split;
- unsigned int full;
-};
-
-/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
-union cpuid_0x10_3_eax {
- struct {
- unsigned int max_delay:12;
- } split;
- unsigned int full;
-};
-
-/* CPUID.(EAX=10H, ECX=ResID).EDX */
-union cpuid_0x10_x_edx {
- struct {
- unsigned int cos_max:16;
- } split;
- unsigned int full;
-};
-
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-
-void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-
-/*
- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
- *
- * Following considerations are made so that this has minimal impact
- * on scheduler hot path:
- * - This will stay as no-op unless we are running on an Intel SKU
- * which supports resource control and we enable by mounting the
- * resctrl file system.
- * - Caches the per cpu CLOSid values and does the MSR write only
- * when a task with a different CLOSid is scheduled in.
- *
- * Must be called with preemption disabled.
- */
-static inline void intel_rdt_sched_in(void)
-{
- if (static_branch_likely(&rdt_enable_key)) {
- struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
- int closid;
-
- /*
- * If this task has a closid assigned, use it.
- * Else use the closid assigned to this cpu.
- */
- closid = current->closid;
- if (closid == 0)
- closid = this_cpu_read(cpu_closid);
-
- if (closid != state->closid) {
- state->closid = closid;
- wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
- }
- }
-}
-
-#else
-
-static inline void intel_rdt_sched_in(void) {}
-
-#endif /* CONFIG_INTEL_RDT_A */
-#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
deleted file mode 100644
index b31081b89407..000000000000
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _ASM_X86_INTEL_RDT_COMMON_H
-#define _ASM_X86_INTEL_RDT_COMMON_H
-
-#define MSR_IA32_PQR_ASSOC 0x0c8f
-
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid: The cached Resource Monitoring ID
- * @closid: The cached Class Of Service ID
- * @rmid_usecnt: The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
- u32 rmid;
- u32 closid;
- int rmid_usecnt;
-};
-
-DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
-
-#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h
new file mode 100644
index 000000000000..b4bbf8b21512
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -0,0 +1,92 @@
+#ifndef _ASM_X86_INTEL_RDT_SCHED_H
+#define _ASM_X86_INTEL_RDT_SCHED_H
+
+#ifdef CONFIG_INTEL_RDT
+
+#include <linux/sched.h>
+#include <linux/jump_label.h>
+
+#define IA32_PQR_ASSOC 0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @cur_rmid: The cached Resource Monitoring ID
+ * @cur_closid: The cached Class Of Service ID
+ * @default_rmid: The user assigned Resource Monitoring ID
+ * @default_closid: The user assigned cached Class Of Service ID
+ *
+ * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them. This also
+ * stores the user configured per cpu CLOSID and RMID.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 cur_rmid;
+ u32 cur_closid;
+ u32 default_rmid;
+ u32 default_closid;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+/*
+ * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports resource control or monitoring and we enable by
+ * mounting the resctrl file system.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ * when a task with a different CLOSid/RMID is scheduled in.
+ * - We allocate RMIDs/CLOSids globally in order to keep this as
+ * simple as possible.
+ * Must be called with preemption disabled.
+ */
+static void __intel_rdt_sched_in(void)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 closid = state->default_closid;
+ u32 rmid = state->default_rmid;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (static_branch_likely(&rdt_alloc_enable_key)) {
+ if (current->closid)
+ closid = current->closid;
+ }
+
+ if (static_branch_likely(&rdt_mon_enable_key)) {
+ if (current->rmid)
+ rmid = current->rmid;
+ }
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ state->cur_closid = closid;
+ state->cur_rmid = rmid;
+ wrmsr(IA32_PQR_ASSOC, rmid, closid);
+ }
+}
+
+static inline void intel_rdt_sched_in(void)
+{
+ if (static_branch_likely(&rdt_enable_key))
+ __intel_rdt_sched_in();
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif /* CONFIG_INTEL_RDT */
+
+#endif /* _ASM_X86_INTEL_RDT_SCHED_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 4bc6f459a8b6..c40a95c33bb8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
build_mmio_write(__writew, "w", unsigned short, "r", )
build_mmio_write(__writel, "l", unsigned int, "r", )
+#define readb readb
+#define readw readw
+#define readl readl
#define readb_relaxed(a) __readb(a)
#define readw_relaxed(a) __readw(a)
#define readl_relaxed(a) __readl(a)
@@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#define __raw_readw __readw
#define __raw_readl __readl
+#define writeb writeb
+#define writew writew
+#define writel writel
#define writeb_relaxed(v, a) __writeb(v, a)
#define writew_relaxed(v, a) __writew(v, a)
#define writel_relaxed(v, a) __writel(v, a)
@@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#ifdef CONFIG_X86_64
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
-#define readq_relaxed(a) readq(a)
-#define writeq_relaxed(v, a) writeq(v, a)
+#define readq_relaxed(a) __readq(a)
+#define writeq_relaxed(v, a) __writeq(v, a)
-#define __raw_readq(a) readq(a)
-#define __raw_writeq(val, addr) writeq(val, addr)
+#define __raw_readq __readq
+#define __raw_writeq __writeq
/* Let people know that we have them */
#define readq readq
@@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
{
return __pa(address);
}
+#define virt_to_phys virt_to_phys
/**
* phys_to_virt - map physical address to virtual
@@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
{
return __va(address);
}
+#define phys_to_virt phys_to_virt
/*
* Change "struct page" to physical address.
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
* else, you probably want one of the following.
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
+#define ioremap_prot ioremap_prot
/**
* ioremap - map bus memory into CPU space
@@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
{
return ioremap_nocache(offset, size);
}
+#define ioremap ioremap
extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
extern void set_iounmap_nonlazy(void);
@@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void);
#include <asm-generic/iomap.h>
/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-/**
- * memset_io Set a range of I/O memory to a constant value
- * @addr: The beginning of the I/O-memory range to set
- * @val: The value to set the memory to
- * @count: The number of bytes to set
- *
- * Set a range of I/O memory to a given value.
- */
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
-{
- memset((void __force *)addr, val, count);
-}
-
-/**
- * memcpy_fromio Copy a block of data from I/O memory
- * @dst: The (RAM) destination for the copy
- * @src: The (I/O memory) source for the data
- * @count: The number of bytes to copy
- *
- * Copy a block of data from I/O memory.
- */
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
-{
- memcpy(dst, (const void __force *)src, count);
-}
-
-/**
- * memcpy_toio Copy a block of data into I/O memory
- * @dst: The (I/O memory) destination for the copy
- * @src: The (RAM) source for the data
- * @count: The number of bytes to copy
- *
- * Copy a block of data to I/O memory.
- */
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
-{
- memcpy((void __force *)dst, src, count);
-}
-
-/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
* to PAGE_OFFSET is pure coincidence - it does not mean ISA values
@@ -341,13 +309,38 @@ BUILDIO(b, b, char)
BUILDIO(w, w, short)
BUILDIO(l, , int)
+#define inb inb
+#define inw inw
+#define inl inl
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb outb
+#define outw outw
+#define outl outl
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
extern void *xlate_dev_mem_ptr(phys_addr_t phys);
extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
extern bool is_early_ioremap_ptep(pte_t *ptep);
@@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
#define IO_SPACE_LIMIT 0xffff
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
#ifdef CONFIG_MTRR
extern int __must_check arch_phys_wc_index(int handle);
#define arch_phys_wc_index arch_phys_wc_index
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 668cca540025..9958ceea2fa3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
extern __visible unsigned int do_IRQ(struct pt_regs *regs);
-/* Interrupt vector management */
-extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-extern int vector_used_by_percpu_irq(unsigned int vector);
-
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index f70604125286..ddbb8ea0f5a9 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -3,9 +3,17 @@
#include <asm/cpufeature.h>
+#ifdef CONFIG_X86_LOCAL_APIC
static inline bool arch_irq_work_has_interrupt(void)
{
return boot_cpu_has(X86_FEATURE_APIC);
}
+extern void arch_irq_work_raise(void);
+#else
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return false;
+}
+#endif
#endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7cbaab523f22..369e41c23f07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -492,6 +492,7 @@ struct kvm_vcpu_arch {
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
+ u32 pkru;
u32 hflags;
u64 efer;
u64 apic_base;
@@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS 10
-#define GDT_ENTRY_LGUEST_DS 11
-#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
- /* Manually saved part. */
- unsigned long eax, ebx, ecx, edx;
- unsigned long esi, edi, ebp;
- unsigned long gs;
- unsigned long fs, ds, es;
- unsigned long trapnum, errcode;
- /* Trap pushed part */
- unsigned long eip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long esp;
- unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
- /* Host information we need to restore when we switch back. */
- u32 host_cr3;
- struct desc_ptr host_idt_desc;
- struct desc_ptr host_gdt_desc;
- u32 host_sp;
-
- /* Fields which are used when guest is running. */
- struct desc_ptr guest_idt_desc;
- struct desc_ptr guest_gdt_desc;
- struct x86_hw_tss guest_tss;
- struct desc_struct guest_idt[IDT_ENTRIES];
- struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
- /* The GDT entries copied into lguest_ro_state when running. */
- struct desc_struct gdt[GDT_ENTRIES];
-
- /* The IDT entries: some copied into lguest_ro_state when running. */
- struct desc_struct idt[IDT_ENTRIES];
-
- /* The address of the last guest-visible pagefault (ie. cr2). */
- unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
- u32 cr0;
-
- cr0 = read_cr0();
- if (!(cr0 & 8))
- write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
- ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC 0
-#define LHCALL_LGUEST_INIT 1
-#define LHCALL_SHUTDOWN 2
-#define LHCALL_NEW_PGTABLE 4
-#define LHCALL_FLUSH_TLB 5
-#define LHCALL_LOAD_IDT_ENTRY 6
-#define LHCALL_SET_STACK 7
-#define LHCALL_SET_CLOCKEVENT 9
-#define LHCALL_HALT 10
-#define LHCALL_SET_PMD 13
-#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PGD 15
-#define LHCALL_LOAD_TLS 16
-#define LHCALL_LOAD_GDT_ENTRY 18
-#define LHCALL_SEND_INTERRUPTS 19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF 1
-#define LGUEST_SHUTDOWN_RESTART 2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations? There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts). Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure. This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
- unsigned long arg1, unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* "int" is the Intel instruction to trigger a trap. */
- asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
- /* The call in %eax (aka "a") might be overwritten */
- : "=a"(call)
- /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
- : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
- /* "memory" means this might write somewhere in memory.
- * This isn't true for all calls, but it's safe to tell
- * gcc that it might happen so it doesn't get clever. */
- : "memory");
- return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
- /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d25d9f4abb15..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -148,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
- init_new_context_ldt(tsk, mm);
-
- return 0;
+ return init_new_context_ldt(tsk, mm);
}
static inline void destroy_context(struct mm_struct *mm)
{
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819caeef..9eb7c718aaf8 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -2,6 +2,15 @@
#define _ASM_X86_MODULE_H
#include <asm-generic/module.h>
+#include <asm/orc_types.h>
+
+struct mod_arch_specific {
+#ifdef CONFIG_ORC_UNWINDER
+ unsigned int num_orcs;
+ int *orc_unwind_ip;
+ struct orc_entry *orc_unwind;
+#endif
+};
#ifdef CONFIG_X86_64
/* X86_64 does not define MODULE_PROC_FAMILY */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 0d4b01c5e438..63cc96f064dc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -30,6 +30,8 @@ struct ms_hyperv_info {
u32 features;
u32 misc_features;
u32 hints;
+ u32 max_vp_index;
+ u32 max_lp_index;
};
extern struct ms_hyperv_info ms_hyperv;
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000000..91c8d868424d
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range. An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER 8
+#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP (unsigned long)_stext
+#define LOOKUP_STOP_IP (unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..9c9dc579bd7d
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED 0
+#define ORC_REG_PREV_SP 1
+#define ORC_REG_DX 2
+#define ORC_REG_DI 3
+#define ORC_REG_BP 4
+#define ORC_REG_SP 5
+#define ORC_REG_R10 6
+#define ORC_REG_R13 7
+#define ORC_REG_BP_INDIRECT 8
+#define ORC_REG_SP_INDIRECT 9
+#define ORC_REG_MAX 15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL 0
+#define ORC_TYPE_REGS 1
+#define ORC_TYPE_REGS_IRET 2
+#define UNWIND_HINT_TYPE_SAVE 3
+#define UNWIND_HINT_TYPE_RESTORE 4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard. It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address. Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+ s16 sp_offset;
+ s16 bp_offset;
+ unsigned sp_reg:4;
+ unsigned bp_reg:4;
+ unsigned type:2;
+} __packed;
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..c25dd22f7c70 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -960,11 +960,6 @@ extern void default_banner(void);
#define GET_CR2_INTO_RAX \
call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
- CLBR_NONE, \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
-
#define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9ffc36bfe4cd..6b64fc6367f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -196,9 +196,6 @@ struct pv_irq_ops {
void (*safe_halt)(void);
void (*halt)(void);
-#ifdef CONFIG_X86_64
- void (*adjust_exception_frame)(void);
-#endif
} __no_randomize_layout;
struct pv_mmu_ops {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c61bab07a84e..3fa26a61eabc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -667,7 +668,7 @@ static inline void sync_core(void)
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
- * but it will fault at CPL3 (i.e. Xen PV and lguest).
+ * but it will fault at CPL3 (i.e. Xen PV).
*
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
@@ -690,6 +691,7 @@ static inline void sync_core(void)
unsigned int tmp;
asm volatile (
+ UNWIND_HINT_SAVE
"mov %%ss, %0\n\t"
"pushq %q0\n\t"
"pushq %%rsp\n\t"
@@ -699,6 +701,7 @@ static inline void sync_core(void)
"pushq %q0\n\t"
"pushq $1f\n\t"
"iretq\n\t"
+ UNWIND_HINT_RESTORE
"1:"
: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
#endif
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 8d3964fc5f91..b408b1886195 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_INT80_compat(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+void xen_entry_INT80_compat(void);
+#endif
#endif
void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..91c04c8e67fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -9,6 +9,20 @@
#ifdef __i386__
struct pt_regs {
+ /*
+ * NB: 32-bit x86 CPUs are inconsistent as what happens in the
+ * following cases (where %seg represents a segment register):
+ *
+ * - pushl %seg: some do a 16-bit write and leave the high
+ * bits alone
+ * - movl %seg, [mem]: some do a 16-bit write despite the movl
+ * - IDT entry: some (e.g. 486) will leave the high bits of CS
+ * and (if applicable) SS undefined.
+ *
+ * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+ * so we can just treat all of the segment registers as 16-bit
+ * values.
+ */
unsigned long bx;
unsigned long cx;
unsigned long dx;
@@ -16,16 +30,22 @@ struct pt_regs {
unsigned long di;
unsigned long bp;
unsigned long ax;
- unsigned long ds;
- unsigned long es;
- unsigned long fs;
- unsigned long gs;
+ unsigned short ds;
+ unsigned short __dsh;
+ unsigned short es;
+ unsigned short __esh;
+ unsigned short fs;
+ unsigned short __fsh;
+ unsigned short gs;
+ unsigned short __gsh;
unsigned long orig_ax;
unsigned long ip;
- unsigned long cs;
+ unsigned short cs;
+ unsigned short __csh;
unsigned long flags;
unsigned long sp;
- unsigned long ss;
+ unsigned short ss;
+ unsigned short __ssh;
};
#else /* __i386__ */
@@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
if (offset == offsetof(struct pt_regs, sp) &&
regs->cs == __KERNEL_CS)
return kernel_stack_pointer(regs);
+
+ /* The selector fields are 16-bit. */
+ if (offset == offsetof(struct pt_regs, cs) ||
+ offset == offsetof(struct pt_regs, ss) ||
+ offset == offsetof(struct pt_regs, ds) ||
+ offset == offsetof(struct pt_regs, es) ||
+ offset == offsetof(struct pt_regs, fs) ||
+ offset == offsetof(struct pt_regs, gs)) {
+ return *(u16 *)((unsigned long)regs + offset);
+
+ }
#endif
return *(unsigned long *)((unsigned long)regs + offset);
}
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+
+/*
+ * This is the first portion of the refcount error handling, which lives in
+ * .text.unlikely, and is jumped to from the CPU flag check (in the
+ * following macros). This saves the refcount value location into CX for
+ * the exception handler to use (in mm/extable.c), and then triggers the
+ * central refcount exception. The fixup address for the exception points
+ * back to the regular execution flow in .text.
+ */
+#define _REFCOUNT_EXCEPTION \
+ ".pushsection .text.unlikely\n" \
+ "111:\tlea %[counter], %%" _ASM_CX "\n" \
+ "112:\t" ASM_UD0 "\n" \
+ ASM_UNREACHABLE \
+ ".popsection\n" \
+ "113:\n" \
+ _ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO \
+ "js 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO \
+ "jz 111f\n\t" \
+ REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR \
+ "jmp 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : "ir" (i)
+ : "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "incl %0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "decl %0\n\t"
+ REFCOUNT_CHECK_LE_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "er", i, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+ GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "%0", e);
+}
+
+static __always_inline __must_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+ int c, result;
+
+ c = atomic_read(&(r->refs));
+ do {
+ if (unlikely(c == 0))
+ return false;
+
+ result = c + i;
+
+ /* Did we try to increment from/to an undesirable state? */
+ if (unlikely(c < 0 || c == INT_MAX || result < c)) {
+ asm volatile(REFCOUNT_ERROR
+ : : [counter] "m" (r->refs.counter)
+ : "cc", "cx");
+ break;
+ }
+
+ } while (!atomic_try_cmpxchg(&(r->refs), &c, result));
+
+ return c != 0;
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+ return refcount_add_not_zero(1, r);
+}
+
+#endif
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..045f99211a99 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,45 +1,56 @@
#ifndef _ASM_X86_RMWcc
#define _ASM_X86_RMWcc
+#define __CLOBBERS_MEM "memory"
+#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx"
+
#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
/* Use asm goto */
-#define __GEN_RMWcc(fullop, var, cc, ...) \
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
- : : "m" (var), ## __VA_ARGS__ \
- : "memory" : cc_label); \
+ : : [counter] "m" (var), ## __VA_ARGS__ \
+ : clobbers : cc_label); \
return 0; \
cc_label: \
return 1; \
} while (0)
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
- __GEN_RMWcc(op " " arg0, var, cc)
+#define __BINARY_RMWcc_ARG " %1, "
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
- __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
/* Use flags output or a set instruction */
-#define __GEN_RMWcc(fullop, var, cc, ...) \
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
bool c; \
asm volatile (fullop ";" CC_SET(cc) \
- : "+m" (var), CC_OUT(cc) (c) \
- : __VA_ARGS__ : "memory"); \
+ : [counter] "+m" (var), CC_OUT(cc) (c) \
+ : __VA_ARGS__ : clobbers); \
return c; \
} while (0)
+#define __BINARY_RMWcc_ARG " %2, "
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
- __GEN_RMWcc(op " " arg0, var, cc)
+ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \
+ __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM_CC_CX)
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
- __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
+ __CLOBBERS_MEM, vcon (val))
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM_CC_CX, vcon (val))
#endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 1549caa098f0..066aaf813141 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -238,9 +238,7 @@
#ifndef __ASSEMBLY__
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
-#ifdef CONFIG_TRACING
-# define trace_early_idt_handler_array early_idt_handler_array
-#endif
+extern void early_ignore_irq(void);
/*
* Load a segment. Fall back on loading the zero segment if something goes
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index e4585a393965..a65cf544686a 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -39,6 +39,7 @@ static inline void vsmp_init(void) { }
#endif
void setup_bios_corruption_check(void);
+void early_platform_quirks(void);
extern unsigned long saved_video_mode;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
_TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \
_TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \
- _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+ _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_FSCHECK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d23e61dc0640..4893abf7f74f 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -198,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
cr4_set_bits(mask);
}
+extern void initialize_tlbstate_and_flush(void);
+
static inline void __native_flush_tlb(void)
{
/*
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 6358a85e2270..c1d2a9892352 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
extern void setup_node_to_cpumask_map(void);
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
-
#define pcibus_to_node(bus) __pcibus_to_node(bus)
extern int __node_distance(int, int);
diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
new file mode 100644
index 000000000000..57c8da027d99
--- /dev/null
+++ b/arch/x86/include/asm/trace/common.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_TRACE_COMMON_H
+#define _ASM_TRACE_COMMON_H
+
+#ifdef CONFIG_TRACING
+DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
+#define trace_pagefault_enabled() \
+ static_branch_unlikely(&trace_pagefault_key)
+DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+#define trace_resched_ipi_enabled() \
+ static_branch_unlikely(&trace_resched_ipi_key)
+#else
+static inline bool trace_pagefault_enabled(void) { return false; }
+static inline bool trace_resched_ipi_enabled(void) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2422b14c50a7..5665bf205b8d 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -5,9 +5,10 @@
#define _TRACE_PAGE_FAULT_H
#include <linux/tracepoint.h>
+#include <asm/trace/common.h>
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+extern int trace_pagefault_reg(void);
+extern void trace_pagefault_unreg(void);
DECLARE_EVENT_CLASS(x86_exceptions,
@@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \
TP_PROTO(unsigned long address, struct pt_regs *regs, \
unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
- trace_irq_vector_regfunc, \
- trace_irq_vector_unregfunc);
+ trace_pagefault_reg, trace_pagefault_unreg);
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 32dd6a9e343c..1599d394c8c1 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -5,9 +5,12 @@
#define _TRACE_IRQ_VECTORS_H
#include <linux/tracepoint.h>
+#include <asm/trace/common.h>
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int trace_resched_ipi_reg(void);
+extern void trace_resched_ipi_unreg(void);
DECLARE_EVENT_CLASS(x86_irq_vector,
@@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector,
#define DEFINE_IRQ_VECTOR_EVENT(name) \
DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL); \
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL);
+
+#define DEFINE_RESCHED_IPI_EVENT(name) \
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
+ TP_PROTO(int vector), \
TP_ARGS(vector), \
- trace_irq_vector_regfunc, \
- trace_irq_vector_unregfunc); \
+ trace_resched_ipi_reg, \
+ trace_resched_ipi_unreg); \
DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
TP_PROTO(int vector), \
TP_ARGS(vector), \
- trace_irq_vector_regfunc, \
- trace_irq_vector_unregfunc);
-
+ trace_resched_ipi_reg, \
+ trace_resched_ipi_unreg);
/*
* local_timer - called when entering/exiting a local timer interrupt
@@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
DEFINE_IRQ_VECTOR_EVENT(local_timer);
/*
- * reschedule - called when entering/exiting a reschedule vector handler
- */
-DEFINE_IRQ_VECTOR_EVENT(reschedule);
-
-/*
* spurious_apic - called when entering/exiting a spurious apic vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(spurious_apic);
@@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic);
*/
DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);
+#ifdef CONFIG_IRQ_WORK
/*
* irq_work - called when entering/exiting a irq work interrupt
* vector handler
@@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work);
* 4) goto 1
*/
TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
+#endif
+
+/*
+ * The ifdef is required because that tracepoint macro hell emits tracepoint
+ * code in files which include this header even if the tracepoint is not
+ * enabled. Brilliant stuff that.
+ */
+#ifdef CONFIG_SMP
+/*
+ * reschedule - called when entering/exiting a reschedule vector handler
+ */
+DEFINE_RESCHED_IPI_EVENT(reschedule);
/*
* call_function - called when entering/exiting a call function interrupt
@@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function);
* single interrupt vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(call_function_single);
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
/*
* threshold_apic - called when entering/exiting a threshold apic interrupt
* vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
+#endif
+#ifdef CONFIG_X86_MCE_AMD
/*
* deferred_error_apic - called when entering/exiting a deferred apic interrupt
* vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
/*
* thermal_apic - called when entering/exiting a thermal apic interrupt
* vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
+#endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7f48cd..5545f6459bf5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,9 +13,6 @@ asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
-asmlinkage void xen_debug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_stack_segment(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
@@ -38,22 +35,29 @@ asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
-#ifdef CONFIG_TRACING
-asmlinkage void trace_page_fault(void);
-#define trace_stack_segment stack_segment
-#define trace_divide_error divide_error
-#define trace_bounds bounds
-#define trace_invalid_op invalid_op
-#define trace_device_not_available device_not_available
-#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
-#define trace_invalid_TSS invalid_TSS
-#define trace_segment_not_present segment_not_present
-#define trace_general_protection general_protection
-#define trace_spurious_interrupt_bug spurious_interrupt_bug
-#define trace_coprocessor_error coprocessor_error
-#define trace_alignment_check alignment_check
-#define trace_simd_coprocessor_error simd_coprocessor_error
-#define trace_async_page_fault async_page_fault
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xendebug(void);
+asmlinkage void xen_xenint3(void);
+asmlinkage void xen_nmi(void);
+asmlinkage void xen_overflow(void);
+asmlinkage void xen_bounds(void);
+asmlinkage void xen_invalid_op(void);
+asmlinkage void xen_device_not_available(void);
+asmlinkage void xen_double_fault(void);
+asmlinkage void xen_coprocessor_segment_overrun(void);
+asmlinkage void xen_invalid_TSS(void);
+asmlinkage void xen_segment_not_present(void);
+asmlinkage void xen_stack_segment(void);
+asmlinkage void xen_general_protection(void);
+asmlinkage void xen_page_fault(void);
+asmlinkage void xen_spurious_interrupt_bug(void);
+asmlinkage void xen_coprocessor_error(void);
+asmlinkage void xen_alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void xen_machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void xen_simd_coprocessor_error(void);
#endif
dotraplinkage void do_divide_error(struct pt_regs *, long);
@@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
-#ifdef CONFIG_TRACING
-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
-#else
-static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
-{
- do_page_fault(regs, error);
-}
-#endif
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 30269dafec47..184eb9894dae 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@
#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
-#define set_fs(x) (current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+}
#define segment_eq(a, b) ((a).seg == (b).seg)
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e6676495b125..e9f793e2df7a 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,11 +12,14 @@ struct unwind_state {
struct task_struct *task;
int graph_idx;
bool error;
-#ifdef CONFIG_FRAME_POINTER
+#if defined(CONFIG_ORC_UNWINDER)
+ bool signal, full_regs;
+ unsigned long sp, bp, ip;
+ struct pt_regs *regs;
+#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
bool got_irq;
- unsigned long *bp, *orig_sp;
+ unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
- unsigned long ip;
#else
unsigned long *sp;
#endif
@@ -24,41 +27,30 @@ struct unwind_state {
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame);
-
bool unwind_next_frame(struct unwind_state *state);
-
unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
static inline bool unwind_done(struct unwind_state *state)
{
return state->stack_info.type == STACK_TYPE_UNKNOWN;
}
-static inline
-void unwind_start(struct unwind_state *state, struct task_struct *task,
- struct pt_regs *regs, unsigned long *first_frame)
-{
- first_frame = first_frame ? : get_stack_pointer(task, regs);
-
- __unwind_start(state, task, regs, first_frame);
-}
-
static inline bool unwind_error(struct unwind_state *state)
{
return state->error;
}
-#ifdef CONFIG_FRAME_POINTER
-
static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
{
- if (unwind_done(state))
- return NULL;
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
- return state->regs ? &state->regs->ip : state->bp + 1;
+ __unwind_start(state, task, regs, first_frame);
}
+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
return state->regs;
}
-
-#else /* !CONFIG_FRAME_POINTER */
-
-static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
return NULL;
}
+#endif
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+#ifdef CONFIG_ORC_UNWINDER
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size) {}
+#endif
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
{
- return NULL;
+#ifdef CONFIG_SMP
+ return task != current && task->on_cpu;
+#else
+ return false;
+#endif
}
-#endif /* CONFIG_FRAME_POINTER */
-
#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..bae46fc6b9de
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest. The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack. Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
+ * category. In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer. Such
+ * code needs to be annotated such that objtool can understand it. The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction. Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary. It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+ .pushsection .discard.unwind_hints
+ /* struct unwind_hint */
+ .long .Lunwind_hint_ip_\@ - .
+ .short \sp_offset
+ .byte \sp_reg
+ .byte \type
+ .popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+ .if \base == %rsp
+ .if \indirect
+ .set sp_reg, ORC_REG_SP_INDIRECT
+ .else
+ .set sp_reg, ORC_REG_SP
+ .endif
+ .elseif \base == %rbp
+ .set sp_reg, ORC_REG_BP
+ .elseif \base == %rdi
+ .set sp_reg, ORC_REG_DI
+ .elseif \base == %rdx
+ .set sp_reg, ORC_REG_DX
+ .elseif \base == %r10
+ .set sp_reg, ORC_REG_R10
+ .else
+ .error "UNWIND_HINT_REGS: bad base register"
+ .endif
+
+ .set sp_offset, \offset
+
+ .if \iret
+ .set type, ORC_TYPE_REGS_IRET
+ .elseif \extra == 0
+ .set type, ORC_TYPE_REGS_IRET
+ .set sp_offset, \offset + (16*8)
+ .else
+ .set type, ORC_TYPE_REGS
+ .endif
+
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+ UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+ UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type) \
+ "987: \n\t" \
+ ".pushsection .discard.unwind_hints\n\t" \
+ /* struct unwind_hint */ \
+ ".long 987b - .\n\t" \
+ ".short " __stringify(sp_offset) "\n\t" \
+ ".byte " __stringify(sp_reg) "\n\t" \
+ ".byte " __stringify(type) "\n\t" \
+ ".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 11071fcd630e..9606688caa4b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,6 +552,8 @@ static inline void
MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
{
+ u32 *p = (u32 *) &desc;
+
mcl->op = __HYPERVISOR_update_descriptor;
if (sizeof(maddr) == sizeof(long)) {
mcl->args[0] = maddr;
@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
} else {
mcl->args[0] = maddr;
mcl->args[1] = maddr >> 32;
- mcl->args[2] = desc.a;
- mcl->args[3] = desc.b;
+ mcl->args[2] = *p++;
+ mcl->args[3] = *p;
}
trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
*
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 39bca7fac087..3be08f07695c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,9 +3,6 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
-#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
-
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
* Take the 4 protection key bits out of the vma->vm_flags
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a01892bdd61a..fd0a7895b63f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
obj-$(CONFIG_COMPAT) += signal_compat.o
-obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
@@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-ifdef CONFIG_FRAME_POINTER
-obj-y += unwind_frame.o
-else
-obj-y += unwind_guess.o
-endif
+obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
+obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
+obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 97bb2caf3428..f8ae286c1502 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -118,7 +118,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
* This is just a simple wrapper around early_memremap(),
* with sanity checks for phys == 0 and size == 0.
*/
-char *__init __acpi_map_table(unsigned long phys, unsigned long size)
+void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
{
if (!phys || !size)
@@ -127,7 +127,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
return early_memremap(phys, size);
}
-void __init __acpi_unmap_table(char *map, unsigned long size)
+void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
{
if (!map || !size)
return;
@@ -199,8 +199,10 @@ static int __init
acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
{
struct acpi_madt_local_x2apic *processor = NULL;
+#ifdef CONFIG_X86_X2APIC
int apic_id;
u8 enabled;
+#endif
processor = (struct acpi_madt_local_x2apic *)header;
@@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
acpi_table_print_madt_entry(header);
+#ifdef CONFIG_X86_X2APIC
apic_id = processor->local_apic_id;
enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
-#ifdef CONFIG_X86_X2APIC
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -1083,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void)
mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
#endif
set_bit(MP_ISA_BUS, mp_bus_not_pci);
- pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
+ pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs());
/*
* Use the default configuration for the IRQs 0-15. Unless
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32e14d137416..3344d3382e91 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
int poke_int3_handler(struct pt_regs *regs)
{
- /* bp_patching_in_progress */
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * bp_patching_in_progress.
+ *
+ * in_progress = TRUE INT3
+ * WMB RMB
+ * write INT3 if (in_progress)
+ *
+ * Idem for bp_int3_handler.
+ */
smp_rmb();
if (likely(!bp_patching_in_progress))
@@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
/*
- * Corresponding read barrier in int3 notifier for
- * making sure the in_progress flags is correctly ordered wrt.
- * patching
+ * Corresponding read barrier in int3 notifier for making sure the
+ * in_progress and handler are correctly ordered wrt. patching.
*/
smp_wmb();
@@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
text_poke(addr, opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
-
+ /*
+ * sync_core() implies an smp_mb() and orders this store against
+ * the writing of the new instruction.
+ */
bp_patching_in_progress = false;
- smp_wmb();
return addr;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98b3dd8cf2bf..7834f73efbf1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -177,8 +177,6 @@ static int disable_apic_timer __initdata;
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-int first_system_vector = FIRST_SYSTEM_VECTOR;
-
/*
* Debug level, exported for io_apic.c
*/
@@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = {
static void apic_check_deadline_errata(void)
{
- const struct x86_cpu_id *m = x86_match_cpu(deadline_match);
+ const struct x86_cpu_id *m;
u32 rev;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ m = x86_match_cpu(deadline_match);
if (!m)
return;
@@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void)
*/
static void local_apic_timer_interrupt(void)
{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+ struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
/*
* Normally we should not be here till LAPIC has been initialized but
@@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
@@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
* interrupt lock, which is the WrongThing (tm) to do.
*/
entering_ack_irq();
- local_apic_timer_interrupt();
- exiting_irq();
-
- set_irq_regs(old_regs);
-}
-
-__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- *
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- entering_ack_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
@@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address)
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-static void __smp_spurious_interrupt(u8 vector)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
{
+ u8 vector = ~regs->orig_ax;
u32 v;
+ entering_irq();
+ trace_spurious_apic_entry(vector);
+
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector)
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
"should never happen.\n", vector, smp_processor_id());
-}
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_spurious_interrupt(~regs->orig_ax);
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
-{
- u8 vector = ~regs->orig_ax;
-
- entering_irq();
- trace_spurious_apic_entry(vector);
- __smp_spurious_interrupt(vector);
trace_spurious_apic_exit(vector);
exiting_irq();
}
@@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-static void __smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
{
- u32 v;
- u32 i = 0;
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
"Receive CS error", /* APIC Error Bit 1 */
@@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs)
"Received illegal vector", /* APIC Error Bit 6 */
"Illegal register address", /* APIC Error Bit 7 */
};
+ u32 v, i = 0;
+
+ entering_irq();
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
@@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT "\n");
-}
-
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_error_interrupt(regs);
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- trace_error_apic_entry(ERROR_APIC_VECTOR);
- __smp_error_interrupt(regs);
trace_error_apic_exit(ERROR_APIC_VECTOR);
exiting_irq();
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 237e9c2341c7..70e48aa6af98 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
entry.vector, entry.irr, entry.delivery_status);
if (ir_entry->format)
printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf, (ir_entry->index << 15) | ir_entry->index,
+ buf, (ir_entry->index2 << 15) | ir_entry->index,
ir_entry->zero);
else
printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b3af457ed667..88c214e75a6b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
offset = current_offset;
next:
vector += 16;
- if (vector >= first_system_vector) {
+ if (vector >= FIRST_SYSTEM_VECTOR) {
offset = (offset + 1) % 16;
vector = FIRST_EXTERNAL_VECTOR + offset;
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
#include <asm/ucontext.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
OFFSET(stack_canary_offset, stack_canary, canary);
#endif
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 99332f550c48..cf42206926af 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,7 +20,6 @@ static char syscalls_ia32[] = {
int main(void)
{
#ifdef CONFIG_PARAVIRT
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cdf82492b770..e17942c131c8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 110ca5d2bb87..9862e2cd6d93 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -297,13 +297,29 @@ static int nearby_node(int apicid)
}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+ u32 cus_per_node;
+
+ if (c->x86 >= 0x17)
+ return;
+
+ cus_per_node = c->x86_max_cores / nodes_per_socket;
+ c->cpu_core_id %= cus_per_node;
+}
+
/*
* Fixup core topology information for
* (1) AMD multi-node processors
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u8 node_id;
@@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
} else
return;
- /* fixup multi-node processor information */
if (nodes_per_socket > 1) {
- u32 cus_per_node;
-
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cus_per_node = c->x86_max_cores / nodes_per_socket;
-
- /* core id has to be in the [0 .. cores_per_node - 1] range */
- c->cpu_core_id %= cus_per_node;
+ legacy_fixup_core_id(c);
}
}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b95cd94ca97b..fb1d3358a4af 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
+ /*
+ * We'd like to use cr4_set_bits_and_update_boot(),
+ * but we can't. CR4.PCIDE is special and can only
+ * be set in long mode, and the early CPU init code
+ * doesn't know this and would try to restore CR4.PCIDE
+ * prior to entering long mode.
+ *
+ * Instead, we rely on the fact that hotplug, resume,
+ * etc all fully restore CR4 before they write anything
+ * that could have nonzero PCID bits to CR3. CR4.PCIDE
+ * has no effect on the page tables themselves, so we
+ * don't need it to be restored early.
+ */
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
@@ -1329,15 +1342,6 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr __ro_after_init = {
- .size = NR_VECTORS * 16 - 1,
- .address = (unsigned long) idt_table,
-};
-const struct desc_ptr debug_idt_descr = {
- .size = NR_VECTORS * 16 - 1,
- .address = (unsigned long) debug_idt_table,
-};
-
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1592,6 +1596,7 @@ void cpu_init(void)
mmgrab(&init_mm);
me->active_mm = &init_mm;
BUG_ON(me->mm);
+ initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1646,6 +1651,7 @@ void cpu_init(void)
mmgrab(&init_mm);
curr->active_mm = &init_mm;
BUG_ON(curr->mm);
+ initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c55fb2cb2acc..24f749324c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct cacheinfo *this_leaf;
int i, sibling;
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+ this_leaf = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
this_leaf = this_cpu_ci->info_list + index;
@@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
&this_leaf->shared_cpu_map);
}
}
- } else if (index == 3) {
- for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- this_cpu_ci = get_cpu_cacheinfo(i);
- if (!this_cpu_ci->info_list)
- continue;
- this_leaf = this_cpu_ci->info_list + index;
- for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
- if (!cpu_online(sibling))
- continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
- }
- }
} else
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5b366462f579..cd5fc61ba450 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -30,7 +30,8 @@
#include <linux/cpuhotplug.h>
#include <asm/intel-family.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
@@ -38,7 +39,13 @@
/* Mutex to protect rdtgroup access. */
DEFINE_MUTEX(rdtgroup_mutex);
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Used to store the max resource name width and max resource data width
@@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
*/
int max_name_width, max_data_width;
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
static void
mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
static void
@@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
struct rdt_resource rdt_resources_all[] = {
+ [RDT_RESOURCE_L3] =
{
+ .rid = RDT_RESOURCE_L3,
.name = "L3",
.domains = domain_init(RDT_RESOURCE_L3),
.msr_base = IA32_L3_CBM_BASE,
@@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L3DATA] =
{
+ .rid = RDT_RESOURCE_L3DATA,
.name = "L3DATA",
.domains = domain_init(RDT_RESOURCE_L3DATA),
.msr_base = IA32_L3_CBM_BASE,
@@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L3CODE] =
{
+ .rid = RDT_RESOURCE_L3CODE,
.name = "L3CODE",
.domains = domain_init(RDT_RESOURCE_L3CODE),
.msr_base = IA32_L3_CBM_BASE,
@@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_L2] =
{
+ .rid = RDT_RESOURCE_L2,
.name = "L2",
.domains = domain_init(RDT_RESOURCE_L2),
.msr_base = IA32_L2_CBM_BASE,
@@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = {
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
+ [RDT_RESOURCE_MBA] =
{
+ .rid = RDT_RESOURCE_MBA,
.name = "MB",
.domains = domain_init(RDT_RESOURCE_MBA),
.msr_base = IA32_MBA_THRTL_BASE,
@@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = {
.cache_level = 3,
.parse_ctrlval = parse_bw,
.format_str = "%d=%*d",
+ .fflags = RFTYPE_RES_MB,
},
};
@@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
* is always 20 on hsw server parts. The minimum cache bitmask length
* allowed for HSW server is always 2 bits. Hardcode all of them.
*/
-static inline bool cache_alloc_hsw_probe(void)
+static inline void cache_alloc_hsw_probe(void)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
- u32 l, h, max_cbm = BIT_MASK(20) - 1;
-
- if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
- return false;
- rdmsr(IA32_L3_CBM_BASE, l, h);
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
- /* If all the bits were set in MSR, return success */
- if (l != max_cbm)
- return false;
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
- r->num_closid = 4;
- r->default_ctrl = max_cbm;
- r->cache.cbm_len = 20;
- r->cache.min_cbm_bits = 2;
- r->capable = true;
- r->enabled = true;
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return;
- return true;
- }
+ r->num_closid = 4;
+ r->default_ctrl = max_cbm;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
- return false;
+ rdt_alloc_capable = true;
}
/*
@@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
return false;
}
r->data_width = 3;
- rdt_get_mba_infofile(r);
- r->capable = true;
- r->enabled = true;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
return true;
}
-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
@@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r)
r->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & r->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
- rdt_get_cache_infofile(r);
- r->capable = true;
- r->enabled = true;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
}
static void rdt_get_cdp_l3_config(int type)
@@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type)
r->cache.cbm_len = r_l3->cache.cbm_len;
r->default_ctrl = r_l3->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
- r->capable = true;
+ r->alloc_capable = true;
/*
* By default, CDP is disabled. CDP can be enabled by mount parameter
* "cdp" during resctrl file system mount time.
*/
- r->enabled = false;
+ r->alloc_enabled = false;
}
static int get_cache_id(int cpu, int level)
@@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
}
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
void rdt_ctrl_update(void *arg)
{
struct msr_param *m = arg;
@@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg)
int cpu = smp_processor_id();
struct rdt_domain *d;
- list_for_each_entry(d, &r->domains, list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- r->msr_update(d, m, r);
- return;
- }
+ d = get_domain_from_cpu(cpu, r);
+ if (d) {
+ r->msr_update(d, m, r);
+ return;
}
pr_warn_once("cpu %d not found in any domain for resource %s\n",
cpu, r->name);
@@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg)
* caller, return the first domain whose id is bigger than the input id.
* The domain list is sorted by id in ascending order.
*/
-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
{
struct rdt_domain *d;
struct list_head *l;
@@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+ size_t tsize;
+
+ if (is_llc_occupancy_enabled()) {
+ d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!d->rmid_busy_llc)
+ return -ENOMEM;
+ INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+ }
+ if (is_mbm_total_enabled()) {
+ tsize = sizeof(*d->mbm_total);
+ d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_total) {
+ kfree(d->rmid_busy_llc);
+ return -ENOMEM;
+ }
+ }
+ if (is_mbm_local_enabled()) {
+ tsize = sizeof(*d->mbm_local);
+ d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_local) {
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ return -ENOMEM;
+ }
+ }
+
+ if (is_mbm_enabled()) {
+ INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ }
+
+ return 0;
+}
+
/*
* domain_add_cpu - Add a cpu to a resource's domain list.
*
@@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d->id = id;
+ cpumask_set_cpu(cpu, &d->cpu_mask);
- if (domain_setup_ctrlval(r, d)) {
+ if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
+ kfree(d);
+ return;
+ }
+
+ if (r->mon_capable && domain_setup_mon_state(r, d)) {
kfree(d);
return;
}
- cpumask_set_cpu(cpu, &d->cpu_mask);
list_add_tail(&d->list, add_pos);
+
+ /*
+ * If resctrl is mounted, add
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ mkdir_mondata_subdir_allrdtgrp(r, d);
}
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
+ /*
+ * If resctrl is mounted, remove all the
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ rmdir_mondata_subdir_allrdtgrp(r, d->id);
kfree(d->ctrl_val);
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ kfree(d->mbm_local);
list_del(&d->list);
+ if (is_mbm_enabled())
+ cancel_delayed_work(&d->mbm_over);
+ if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ /*
+ * When a package is going down, forcefully
+ * decrement rmid->ebusy. There is no way to know
+ * that the L3 was flushed and hence may lead to
+ * incorrect counts in rare scenarios, but leaving
+ * the RMID as busy creates RMID leaks if the
+ * package never comes back.
+ */
+ __check_limbo(d, true);
+ cancel_delayed_work(&d->cqm_limbo);
+ }
+
kfree(d);
+ return;
+ }
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+ if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0);
+ }
+ if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+ has_busy_rmid(r, d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0);
+ }
}
}
-static void clear_closid(int cpu)
+static void clear_closid_rmid(int cpu)
{
struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
- per_cpu(cpu_closid, cpu) = 0;
- state->closid = 0;
- wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+ state->default_closid = 0;
+ state->default_rmid = 0;
+ state->cur_closid = 0;
+ state->cur_rmid = 0;
+ wrmsr(IA32_PQR_ASSOC, 0, 0);
}
static int intel_rdt_online_cpu(unsigned int cpu)
@@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu)
domain_add_cpu(cpu, r);
/* The cpu is set in default rdtgroup after online. */
cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
- clear_closid(cpu);
+ clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex);
return 0;
}
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
+ break;
+ }
+ }
+}
+
static int intel_rdt_offline_cpu(unsigned int cpu)
{
struct rdtgroup *rdtgrp;
@@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
break;
+ }
}
- clear_closid(cpu);
+ clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex);
return 0;
@@ -492,7 +627,7 @@ static __init void rdt_init_padding(void)
struct rdt_resource *r;
int cl;
- for_each_capable_rdt_resource(r) {
+ for_each_alloc_capable_rdt_resource(r) {
cl = strlen(r->name);
if (cl > max_name_width)
max_name_width = cl;
@@ -502,38 +637,153 @@ static __init void rdt_init_padding(void)
}
}
-static __init bool get_rdt_resources(void)
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_MBA,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __initdata = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
+
+static bool __init rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+static __init bool get_rdt_alloc_resources(void)
{
bool ret = false;
- if (cache_alloc_hsw_probe())
+ if (rdt_alloc_capable)
return true;
if (!boot_cpu_has(X86_FEATURE_RDT_A))
return false;
- if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
}
ret = true;
}
- if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
- rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
ret = true;
}
- if (boot_cpu_has(X86_FEATURE_MBA)) {
+ if (rdt_cpu_has(X86_FEATURE_MBA)) {
if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
ret = true;
}
-
return ret;
}
+static __init bool get_rdt_mon_resources(void)
+{
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+ rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ if (!rdt_mon_features)
+ return false;
+
+ return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+}
+
+static __init void rdt_quirks(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_FAM6_SKYLAKE_X:
+ if (boot_cpu_data.x86_mask <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ }
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_quirks();
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void)
return ret;
}
- for_each_capable_rdt_resource(r)
+ for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("Intel RDT %s monitoring detected\n", r->name);
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 000000000000..ebaddaeef023
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,440 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#include <linux/sched.h>
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+#define IA32_MBA_THRTL_BASE 0xd50
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
+
+#define CQM_LIMBOCHECK_INTERVAL 1000
+
+#define MBM_CNTR_WIDTH 24
+#define MBM_OVERFLOW_INTERVAL 1000
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ */
+struct mon_evt {
+ u32 evtid;
+ char *name;
+ struct list_head list;
+};
+
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid: Resource id associated with the event file.
+ * @evtid: Event id associated with the event file
+ * @domid: The domain to which the event file belongs
+ */
+union mon_data_bits {
+ void *priv;
+ struct {
+ unsigned int rid : 10;
+ unsigned int evtid : 8;
+ unsigned int domid : 14;
+ } u;
+};
+
+struct rmid_read {
+ struct rdtgroup *rgrp;
+ struct rdt_domain *d;
+ int evtid;
+ bool first;
+ u64 val;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
+
+enum rdt_group_type {
+ RDTCTRL_GROUP = 0,
+ RDTMON_GROUP,
+ RDT_NUM_GROUP,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn kernlfs node for the mon_data directory
+ * @parent: parent rdtgrp
+ * @crdtgrp_list: child rdtgroup node list
+ * @rmid: rmid for this rdtgroup
+ */
+struct mongroup {
+ struct kernfs_node *mon_data_kn;
+ struct rdtgroup *parent;
+ struct list_head crdtgrp_list;
+ u32 rmid;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ * @type: indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon: mongroup related data
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO BIT(0)
+#define RFTYPE_BASE BIT(1)
+#define RF_CTRLSHIFT 4
+#define RF_MONSHIFT 5
+#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
+#define RFTYPE_MON BIT(RF_MONSHIFT)
+#define RFTYPE_RES_CACHE BIT(8)
+#define RFTYPE_RES_MB BIT(9)
+#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width, max_data_width;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags: File specific RF_* or RFTYPE_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+ unsigned long flags;
+ unsigned long fflags;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ */
+struct mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @rmid_busy_llc:
+ * bitmap of which limbo RMIDs are above threshold
+ * @mbm_total: saved state for MBM total bandwidth
+ * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_over: worker to periodically read MBM h/w counters
+ * @cqm_limbo: worker to periodically read CQM h/w counters
+ * @mbm_work_cpu:
+ * worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ * worker cpu for CQM h/w counters
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @new_ctrl: new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len: Length of the cache bit mask
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * @cbm_idx_mult: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_multi + cbm_idx_offset
+ * in a cache bit mask
+ * @shareable_bits: Bitmask of shareable resource with other
+ * executing entities
+ */
+struct rdt_cache {
+ unsigned int cbm_len;
+ unsigned int min_cbm_bits;
+ unsigned int cbm_idx_mult;
+ unsigned int cbm_idx_offset;
+ unsigned int shareable_bits;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay: Max throttle delay. Delay is the hardware
+ * representation for memory bandwidth.
+ * @min_bw: Minimum memory bandwidth percentage user can request
+ * @bw_gran: Granularity at which the memory bandwidth is allocated
+ * @delay_linear: True if memory B/W delay is in linear scale
+ * @mb_map: Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+ u32 max_delay;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ u32 *mb_map;
+};
+
+static inline bool is_llc_occupancy_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+static inline bool is_mbm_total_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+static inline bool is_mbm_local_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
+static inline bool is_mbm_enabled(void)
+{
+ return (is_mbm_total_enabled() || is_mbm_local_enabled());
+}
+
+static inline bool is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @rid: The index of the resource
+ * @alloc_enabled: Is allocation enabled on this machine
+ * @mon_enabled: Is monitoring enabled for this feature
+ * @alloc_capable: Is allocation available on this machine
+ * @mon_capable: Is monitor feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @cache_level: Which cache level defines scope of this resource
+ * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @data_width: Character width of data when displaying
+ * @domains: All domains for this resource
+ * @cache: Cache allocation related data
+ * @format_str: Per resource format string to show domain value
+ * @parse_ctrlval: Per resource function pointer to parse control values
+ * @evt_list: List of monitoring events
+ * @num_rmid: Number of RMIDs available
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @fflags: flags to choose base and info files
+ */
+struct rdt_resource {
+ int rid;
+ bool alloc_enabled;
+ bool mon_enabled;
+ bool alloc_capable;
+ bool mon_capable;
+ char *name;
+ int num_closid;
+ int cache_level;
+ u32 default_ctrl;
+ unsigned int msr_base;
+ void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
+ struct rdt_resource *r);
+ int data_width;
+ struct list_head domains;
+ struct rdt_cache cache;
+ struct rdt_membw membw;
+ const char *format_str;
+ int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
+ struct rdt_domain *d);
+ struct list_head evt_list;
+ int num_rmid;
+ unsigned int mon_scale;
+ unsigned long fflags;
+};
+
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+int __init rdtgroup_init(void);
+
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+ RDT_RESOURCE_MBA,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable || r->mon_capable)
+
+#define for_each_alloc_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable)
+
+#define for_each_mon_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_capable)
+
+#define for_each_alloc_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_enabled)
+
+#define for_each_mon_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+void rdt_ctrl_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ unsigned long delay_ms);
+void mbm_handle_overflow(struct work_struct *work);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
+
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 406d7a6532f9..f6ea94f8954a 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -26,7 +26,7 @@
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <asm/intel_rdt.h>
+#include "intel_rdt.h"
/*
* Check whether MBA bandwidth percentage value is correct. The value is
@@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
{
struct rdt_resource *r;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
if (!strcmp(resname, r->name) && closid < r->num_closid)
return parse_line(tok, r);
}
@@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
closid = rdtgrp->closid;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
list_for_each_entry(dom, &r->domains, list)
dom->have_new_ctrl = false;
}
@@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
ret = update_domains(r, closid);
if (ret)
goto out;
@@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
{
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- int closid, ret = 0;
+ int ret = 0;
+ u32 closid;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
closid = rdtgrp->closid;
- for_each_enabled_rdt_resource(r) {
+ for_each_alloc_enabled_rdt_resource(r) {
if (closid < r->num_closid)
show_doms(s, r, closid);
}
@@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
rdtgroup_kn_unlock(of->kn);
return ret;
}
+
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first)
+{
+ /*
+ * setup the parameters to send to the IPI to read the data.
+ */
+ rr->rgrp = rdtgrp;
+ rr->evtid = evtid;
+ rr->d = d;
+ rr->val = 0;
+ rr->first = first;
+
+ smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ u32 resid, evtid, domid;
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ union mon_data_bits md;
+ struct rdt_domain *d;
+ struct rmid_read rr;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ md.priv = of->kn->priv;
+ resid = md.u.rid;
+ domid = md.u.domid;
+ evtid = md.u.evtid;
+
+ r = &rdt_resources_all[resid];
+ d = rdt_find_domain(r, domid, NULL);
+ if (!d) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mon_event_read(&rr, d, rdtgrp, evtid, false);
+
+ if (rr.val & RMID_VAL_ERROR)
+ seq_puts(m, "Error\n");
+ else if (rr.val & RMID_VAL_UNAVAIL)
+ seq_puts(m, "Unavailable\n");
+ else
+ seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 000000000000..30827510094b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,499 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+struct rmid_entry {
+ u32 rmid;
+ int busy;
+ struct list_head list;
+};
+
+/**
+ * @rmid_free_lru A least recently used list of free RMIDs
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * @rmid_limbo_count count of currently unused but (potentially)
+ * dirty RMIDs.
+ * This counts RMIDs that no one is currently using but that
+ * may have a occupancy value > intel_cqm_threshold. User can change
+ * the threshold occupancy value.
+ */
+unsigned int rmid_limbo_count;
+
+/**
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ entry = &rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+static u64 __rmid_read(u32 rmid, u32 eventid)
+{
+ u64 val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ return val;
+}
+
+static bool rmid_dirty(struct rmid_entry *entry)
+{
+ u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+
+ return val >= intel_cqm_threshold;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
+ */
+void __check_limbo(struct rdt_domain *d, bool force_free)
+{
+ struct rmid_entry *entry;
+ struct rdt_resource *r;
+ u32 crmid = 1, nrmid;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ /*
+ * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+ * are marked as busy for occupancy < threshold. If the occupancy
+ * is less than the threshold decrement the busy counter of the
+ * RMID and move it to the free list when the counter reaches 0.
+ */
+ for (;;) {
+ nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
+ if (nrmid >= r->num_rmid)
+ break;
+
+ entry = __rmid_entry(nrmid);
+ if (force_free || !rmid_dirty(entry)) {
+ clear_bit(entry->rmid, d->rmid_busy_llc);
+ if (!--entry->busy) {
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+ }
+ crmid = nrmid + 1;
+ }
+}
+
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+ return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+}
+
+/*
+ * As of now the RMIDs allocation is global.
+ * However we keep track of which packages the RMIDs
+ * are used to optimize the limbo list management.
+ */
+int alloc_rmid(void)
+{
+ struct rmid_entry *entry;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? -EBUSY : -ENOSPC;
+
+ entry = list_first_entry(&rmid_free_lru,
+ struct rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ int cpu;
+ u64 val;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ entry->busy = 0;
+ cpu = get_cpu();
+ list_for_each_entry(d, &r->domains, list) {
+ if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+ val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ if (val <= intel_cqm_threshold)
+ continue;
+ }
+
+ /*
+ * For the first limbo RMID in the domain,
+ * setup up the limbo worker.
+ */
+ if (!has_busy_rmid(r, d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
+ set_bit(entry->rmid, d->rmid_busy_llc);
+ entry->busy++;
+ }
+ put_cpu();
+
+ if (entry->busy)
+ rmid_limbo_count++;
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+void free_rmid(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ if (!rmid)
+ return;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ entry = __rmid_entry(rmid);
+
+ if (is_llc_occupancy_enabled())
+ add_rmid_to_limbo(entry);
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+ u64 chunks, shift, tval;
+ struct mbm_state *m;
+
+ tval = __rmid_read(rmid, rr->evtid);
+ if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+ rr->val = tval;
+ return -EINVAL;
+ }
+ switch (rr->evtid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ rr->val += tval;
+ return 0;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ m = &rr->d->mbm_total[rmid];
+ break;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ m = &rr->d->mbm_local[rmid];
+ break;
+ default:
+ /*
+ * Code would never reach here because
+ * an invalid event id would fail the __rmid_read.
+ */
+ return -EINVAL;
+ }
+
+ if (rr->first) {
+ m->prev_msr = tval;
+ m->chunks = 0;
+ return 0;
+ }
+
+ shift = 64 - MBM_CNTR_WIDTH;
+ chunks = (tval << shift) - (m->prev_msr << shift);
+ chunks >>= shift;
+ m->chunks += chunks;
+ m->prev_msr = tval;
+
+ rr->val += m->chunks;
+ return 0;
+}
+
+/*
+ * This is called via IPI to read the CQM/MBM counters
+ * on a domain.
+ */
+void mon_event_count(void *info)
+{
+ struct rdtgroup *rdtgrp, *entry;
+ struct rmid_read *rr = info;
+ struct list_head *head;
+
+ rdtgrp = rr->rgrp;
+
+ if (__mon_event_count(rdtgrp->mon.rmid, rr))
+ return;
+
+ /*
+ * For Ctrl groups read data from child monitor groups.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ if (__mon_event_count(entry->mon.rmid, rr))
+ return;
+ }
+ }
+}
+
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+ struct rmid_read rr;
+
+ rr.first = false;
+ rr.d = d;
+
+ /*
+ * This is protected from concurrent reads from user
+ * as both the user and we hold the global mutex.
+ */
+ if (is_mbm_total_enabled()) {
+ rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+ }
+ if (is_mbm_local_enabled()) {
+ rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+ }
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+ int cpu = smp_processor_id();
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ d = get_domain_from_cpu(cpu, r);
+
+ if (!d) {
+ pr_warn_once("Failure to get domain for limbo worker\n");
+ goto out_unlock;
+ }
+
+ __check_limbo(d, false);
+
+ if (has_busy_rmid(r, d))
+ schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ struct rdt_resource *r;
+ int cpu;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->cqm_work_cpu = cpu;
+
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+ struct rdtgroup *prgrp, *crgrp;
+ int cpu = smp_processor_id();
+ struct list_head *head;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (!static_branch_likely(&rdt_enable_key))
+ goto out_unlock;
+
+ d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (!d)
+ goto out_unlock;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ mbm_update(d, prgrp->mon.rmid);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+ mbm_update(d, crgrp->mon.rmid);
+ }
+
+ schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ int cpu;
+
+ if (!static_branch_likely(&rdt_enable_key))
+ return;
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->mbm_work_cpu = cpu;
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+ struct rmid_entry *entry = NULL;
+ int i, nr_rmids;
+
+ nr_rmids = r->num_rmid;
+ rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_rmids; i++) {
+ entry = &rmid_ptrs[i];
+ INIT_LIST_HEAD(&entry->list);
+
+ entry->rmid = i;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ return 0;
+}
+
+static struct mon_evt llc_occupancy_event = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+ INIT_LIST_HEAD(&r->evt_list);
+
+ if (is_llc_occupancy_enabled())
+ list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+ if (is_mbm_total_enabled())
+ list_add_tail(&mbm_total_event.list, &r->evt_list);
+ if (is_mbm_local_enabled())
+ list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ int ret;
+
+ r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ intel_cqm_threshold /= r->mon_scale;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ r->mon_capable = true;
+ r->mon_enabled = true;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9257bd9dc664..a869d4a073c5 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -32,17 +32,25 @@
#include <uapi/linux/magic.h>
-#include <asm/intel_rdt.h>
-#include <asm/intel_rdt_common.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-struct kernfs_root *rdt_root;
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -66,7 +74,7 @@ static void closid_init(void)
int rdt_min_closid = 32;
/* Compute rdt_min_closid across all resources */
- for_each_enabled_rdt_resource(r)
+ for_each_alloc_enabled_rdt_resource(r)
rdt_min_closid = min(rdt_min_closid, r->num_closid);
closid_free_map = BIT_MASK(rdt_min_closid) - 1;
@@ -75,9 +83,9 @@ static void closid_init(void)
closid_free_map &= ~1;
}
-int closid_alloc(void)
+static int closid_alloc(void)
{
- int closid = ffs(closid_free_map);
+ u32 closid = ffs(closid_free_map);
if (closid == 0)
return -ENOSPC;
@@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
return 0;
}
-static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
- int len)
-{
- struct rftype *rft;
- int ret;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- ret = rdtgroup_add_file(kn, rft);
- if (ret)
- goto error;
- }
-
- return 0;
-error:
- pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
- while (--rft >= rfts)
- kernfs_remove_by_name(kn, rft->name);
- return ret;
-}
-
static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
@@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
.seq_show = rdtgroup_seqfile_show,
};
+static struct kernfs_ops kf_mondata_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .seq_show = rdtgroup_mondata_show,
+};
+
static bool is_cpu_list(struct kernfs_open_file *of)
{
struct rftype *rft = of->kn->priv;
@@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
/*
* This is safe against intel_rdt_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
- * from rdt_update_closid() is proteced against __switch_to() because
+ * from update_closid_rmid() is proteced against __switch_to() because
* preemption is disabled.
*/
-static void rdt_update_cpu_closid(void *closid)
+static void update_cpu_closid_rmid(void *info)
{
- if (closid)
- this_cpu_write(cpu_closid, *(int *)closid);
+ struct rdtgroup *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ }
+
/*
* We cannot unconditionally write the MSR because the current
* executing task might have its own closid selected. Just reuse
@@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid)
/*
* Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
*
- * Per task closids must have been set up before calling this function.
- *
- * The per cpu closids are updated with the smp function call, when @closid
- * is not NULL. If @closid is NULL then all affected percpu closids must
- * have been set up before calling this function.
+ * Per task closids/rmids must have been set up before calling this function.
*/
static void
-rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_update_cpu_closid(closid);
- smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+ update_cpu_closid_rmid(r);
+ smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
put_cpu();
}
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus belong to parent ctrl group */
+ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+ if (cpumask_weight(tmpmask))
+ return -EINVAL;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Give any dropped cpus to parent rdtgroup */
+ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, prgrp);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu rmid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ if (crgrp == rdtgrp)
+ continue;
+ cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+ tmpmask);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+ struct rdtgroup *crgrp;
+
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+ /* update the child mon group masks as well*/
+ list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+ cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+ struct rdtgroup *r, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default)
+ return -EINVAL;
+
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, &rdtgroup_default);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group and
+ * the prev group's child groups that owned them
+ * and update per-cpu closid/rmid.
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+ if (cpumask_weight(tmpmask1))
+ cpumask_rdtgrp_clear(r, tmpmask1);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ /*
+ * Clear child mon group masks since there is a new parent mask
+ * now and update the rmid for the cpus the child lost.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+ update_closid_rmid(tmpmask, rdtgrp);
+ cpumask_clear(&crgrp->cpu_mask);
+ }
+
+ return 0;
+}
+
static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- cpumask_var_t tmpmask, newmask;
- struct rdtgroup *rdtgrp, *r;
+ cpumask_var_t tmpmask, newmask, tmpmask1;
+ struct rdtgroup *rdtgrp;
int ret;
if (!buf)
@@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
free_cpumask_var(tmpmask);
return -ENOMEM;
}
+ if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ return -ENOMEM;
+ }
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
- /* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default) {
- ret = -EINVAL;
- goto unlock;
- }
- /* Give any dropped cpus to rdtgroup_default */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, tmpmask);
- rdt_update_closid(tmpmask, &rdtgroup_default.closid);
- }
-
- /*
- * If we added cpus, remove them from previous group that owned them
- * and update per-cpu closid
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
- list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
- if (r == rdtgrp)
- continue;
- cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
- }
- rdt_update_closid(tmpmask, &rdtgrp->closid);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+ else if (rdtgrp->type == RDTMON_GROUP)
+ ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+ else
+ ret = -EINVAL;
unlock:
rdtgroup_kn_unlock(of->kn);
free_cpumask_var(tmpmask);
free_cpumask_var(newmask);
+ free_cpumask_var(tmpmask1);
return ret ?: nbytes;
}
@@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head)
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
+ current->rmid = 0;
kfree(rdtgrp);
}
@@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
atomic_dec(&rdtgrp->waitcount);
kfree(callback);
} else {
- tsk->closid = rdtgrp->closid;
+ /*
+ * For ctrl_mon groups move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ tsk->closid = rdtgrp->closid;
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid)
+ tsk->rmid = rdtgrp->mon.rmid;
+ else
+ ret = -EINVAL;
+ }
}
return ret;
}
@@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
rcu_read_lock();
for_each_process_thread(p, t) {
- if (t->closid == r->closid)
+ if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+ (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
seq_printf(s, "%d\n", t->pid);
}
rcu_read_unlock();
@@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
-/* Files in each rdtgroup */
-static struct rftype rdtgroup_base_files[] = {
- {
- .name = "cpus",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- },
- {
- .name = "cpus_list",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .flags = RFTYPE_FLAGS_CPUS_LIST,
- },
- {
- .name = "tasks",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_tasks_write,
- .seq_show = rdtgroup_tasks_show,
- },
- {
- .name = "schemata",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_schemata_write,
- .seq_show = rdtgroup_schemata_show,
- },
-};
-
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
return 0;
}
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->cache.shareable_bits);
+ return 0;
+}
+
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
return 0;
}
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_rmid);
+
+ return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ struct mon_evt *mevt;
+
+ list_for_each_entry(mevt, &r->evt_list, list)
+ seq_printf(seq, "%s\n", mevt->name);
+
+ return 0;
+}
+
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
return 0;
}
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+ return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+ return -EINVAL;
+
+ intel_cqm_threshold = bytes / r->mon_scale;
+
+ return nbytes;
+}
+
/* rdtgroup information files for one cache resource. */
-static struct rftype res_cache_info_files[] = {
+static struct rftype res_common_files[] = {
{
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_closids_show,
+ .fflags = RF_CTRL_INFO,
+ },
+ {
+ .name = "mon_features",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_mon_features_show,
+ .fflags = RF_MON_INFO,
+ },
+ {
+ .name = "num_rmids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_rmids_show,
+ .fflags = RF_MON_INFO,
},
{
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_default_ctrl_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_cbm_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
-};
-
-/* rdtgroup information files for memory bandwidth. */
-static struct rftype res_mba_info_files[] = {
{
- .name = "num_closids",
+ .name = "shareable_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_closids_show,
+ .seq_show = rdt_shareable_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_bandwidth",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_bw_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "bandwidth_gran",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bw_gran_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "delay_linear",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_delay_linear_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "max_threshold_occupancy",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = max_threshold_occ_write,
+ .seq_show = max_threshold_occ_show,
+ .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ .fflags = RF_CTRL_BASE,
},
};
-void rdt_get_mba_infofile(struct rdt_resource *r)
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
{
- r->info_files = res_mba_info_files;
- r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if ((fflags & rft->fflags) == rft->fflags) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts) {
+ if ((fflags & rft->fflags) == rft->fflags)
+ kernfs_remove_by_name(kn, rft->name);
+ }
+ return ret;
}
-void rdt_get_cache_infofile(struct rdt_resource *r)
+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+ unsigned long fflags)
{
- r->info_files = res_cache_info_files;
- r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+ struct kernfs_node *kn_subdir;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(kn_info, name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ kernfs_get(kn_subdir);
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_add_files(kn_subdir, fflags);
+ if (!ret)
+ kernfs_activate(kn_subdir);
+
+ return ret;
}
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
- struct kernfs_node *kn_subdir;
- struct rftype *res_info_files;
struct rdt_resource *r;
- int ret, len;
+ unsigned long fflags;
+ char name[32];
+ int ret;
/* create the directory */
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
return PTR_ERR(kn_info);
kernfs_get(kn_info);
- for_each_enabled_rdt_resource(r) {
- kn_subdir = kernfs_create_dir(kn_info, r->name,
- kn_info->mode, r);
- if (IS_ERR(kn_subdir)) {
- ret = PTR_ERR(kn_subdir);
- goto out_destroy;
- }
- kernfs_get(kn_subdir);
- ret = rdtgroup_kn_set_ugid(kn_subdir);
+ for_each_alloc_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_CTRL_INFO;
+ ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
if (ret)
goto out_destroy;
+ }
- res_info_files = r->info_files;
- len = r->nr_info_files;
-
- ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
+ for_each_mon_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_MON_INFO;
+ sprintf(name, "%s_MON", r->name);
+ ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
goto out_destroy;
- kernfs_activate(kn_subdir);
}
/*
@@ -678,6 +889,39 @@ out_destroy:
return ret;
}
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+ char *name, struct kernfs_node **dest_kn)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that @rdtgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -718,14 +962,15 @@ static int cdp_enable(void)
struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
int ret;
- if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+ if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
+ !r_l3code->alloc_capable)
return -EINVAL;
ret = set_l3_qos_cfg(r_l3, true);
if (!ret) {
- r_l3->enabled = false;
- r_l3data->enabled = true;
- r_l3code->enabled = true;
+ r_l3->alloc_enabled = false;
+ r_l3data->alloc_enabled = true;
+ r_l3code->alloc_enabled = true;
}
return ret;
}
@@ -734,11 +979,11 @@ static void cdp_disable(void)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
- r->enabled = r->capable;
+ r->alloc_enabled = r->alloc_capable;
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
- rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
- rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
+ rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
+ rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
set_l3_qos_cfg(r, false);
}
}
@@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
}
}
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **mon_data_kn);
+
static struct dentry *rdt_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct rdt_domain *dom;
+ struct rdt_resource *r;
struct dentry *dentry;
int ret;
@@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
goto out_cdp;
}
+ if (rdt_mon_capable) {
+ ret = mongroup_create_dir(rdtgroup_default.kn,
+ NULL, "mon_groups",
+ &kn_mongrp);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_info;
+ }
+ kernfs_get(kn_mongrp);
+
+ ret = mkdir_mondata_all(rdtgroup_default.kn,
+ &rdtgroup_default, &kn_mondata);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_mongrp;
+ }
+ kernfs_get(kn_mondata);
+ rdtgroup_default.mon.mon_data_kn = kn_mondata;
+ }
+
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_destroy;
+ goto out_mondata;
+
+ if (rdt_alloc_capable)
+ static_branch_enable(&rdt_alloc_enable_key);
+ if (rdt_mon_capable)
+ static_branch_enable(&rdt_mon_enable_key);
+
+ if (rdt_alloc_capable || rdt_mon_capable)
+ static_branch_enable(&rdt_enable_key);
+
+ if (is_mbm_enabled()) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ list_for_each_entry(dom, &r->domains, list)
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+ }
- static_branch_enable(&rdt_enable_key);
goto out;
-out_destroy:
+out_mondata:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mondata);
+out_mongrp:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mongrp);
+out_info:
kernfs_remove(kn_info);
out_cdp:
cdp_disable();
@@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
return 0;
}
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_alloc_capable &&
+ (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_mon_capable &&
+ (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
/*
* Move tasks from one to the other group. If @from is NULL, then all tasks
* in the systems are moved unconditionally (used for teardown).
@@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- if (!from || t->closid == from->closid) {
+ if (!from || is_closid_match(t, from) ||
+ is_rmid_match(t, from)) {
t->closid = to->closid;
+ t->rmid = to->mon.rmid;
+
#ifdef CONFIG_SMP
/*
* This is safe on x86 w/o barriers as the ordering
@@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
read_unlock(&tasklist_lock);
}
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+ struct rdtgroup *sentry, *stmp;
+ struct list_head *head;
+
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ free_rmid(sentry->mon.rmid);
+ list_del(&sentry->mon.crdtgrp_list);
+ kfree(sentry);
+ }
+}
+
/*
* Forcibly remove all of subdirectories under root.
*/
@@ -955,6 +1273,9 @@ static void rmdir_all_sub(void)
rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Free any child rmids */
+ free_all_child_rdtgrp(rdtgrp);
+
/* Remove each rdtgroup other than root */
if (rdtgrp == &rdtgroup_default)
continue;
@@ -967,16 +1288,20 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+ free_rmid(rdtgrp->mon.rmid);
+
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
get_online_cpus();
- rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+ update_closid_rmid(cpu_online_mask, &rdtgroup_default);
put_online_cpus();
kernfs_remove(kn_info);
+ kernfs_remove(kn_mongrp);
+ kernfs_remove(kn_mondata);
}
static void rdt_kill_sb(struct super_block *sb)
@@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb)
mutex_lock(&rdtgroup_mutex);
/*Put everything back to default values. */
- for_each_enabled_rdt_resource(r)
+ for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
cdp_disable();
rmdir_all_sub();
+ static_branch_disable(&rdt_alloc_enable_key);
+ static_branch_disable(&rdt_mon_enable_key);
static_branch_disable(&rdt_enable_key);
kernfs_kill_sb(sb);
mutex_unlock(&rdtgroup_mutex);
@@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = {
.kill_sb = rdt_kill_sb,
};
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+ void *priv)
{
- struct rdtgroup *parent, *rdtgrp;
struct kernfs_node *kn;
- int ret, closid;
+ int ret = 0;
- /* Only allow mkdir in the root directory */
- if (parent_kn != rdtgroup_default.kn)
- return -EPERM;
+ kn = __kernfs_create_file(parent_kn, name, 0444, 0,
+ &kf_mondata_ops, priv, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
- /* Do not accept '\n' to avoid unparsable situation. */
- if (strchr(name, '\n'))
- return -EINVAL;
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
- parent = rdtgroup_kn_lock_live(parent_kn);
- if (!parent) {
- ret = -ENODEV;
- goto out_unlock;
+ return ret;
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups with given domain id.
+ */
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+{
+ struct rdtgroup *prgrp, *crgrp;
+ char name[32];
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ sprintf(name, "mon_%s_%02d", r->name, dom_id);
+ kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
}
+}
- ret = closid_alloc();
- if (ret < 0)
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ union mon_data_bits priv;
+ struct kernfs_node *kn;
+ struct mon_evt *mevt;
+ struct rmid_read rr;
+ char name[32];
+ int ret;
+
+ sprintf(name, "mon_%s_%02d", r->name, d->id);
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that kn is always accessible.
+ */
+ kernfs_get(kn);
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ if (WARN_ON(list_empty(&r->evt_list))) {
+ ret = -EPERM;
+ goto out_destroy;
+ }
+
+ priv.u.rid = r->rid;
+ priv.u.domid = d->id;
+ list_for_each_entry(mevt, &r->evt_list, list) {
+ priv.u.evtid = mevt->evtid;
+ ret = mon_addfile(kn, mevt->name, priv.priv);
+ if (ret)
+ goto out_destroy;
+
+ if (is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+ }
+ kernfs_activate(kn);
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/*
+ * Add all subdirectories of mon_data for "ctrl_mon" groups
+ * and "monitor" groups with given domain id.
+ */
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ struct kernfs_node *parent_kn;
+ struct rdtgroup *prgrp, *crgrp;
+ struct list_head *head;
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ parent_kn = prgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ parent_kn = crgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, crgrp);
+ }
+ }
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+ struct rdt_resource *r,
+ struct rdtgroup *prgrp)
+{
+ struct rdt_domain *dom;
+ int ret;
+
+ list_for_each_entry(dom, &r->domains, list) {
+ ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This creates a directory mon_data which contains the monitored data.
+ *
+ * mon_data has one directory for each domain whic are named
+ * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **dest_kn)
+{
+ struct rdt_resource *r;
+ struct kernfs_node *kn;
+ int ret;
+
+ /*
+ * Create the mon_data directory first.
+ */
+ ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+ if (ret)
+ return ret;
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * Create the subdirectories for each domain. Note that all events
+ * in a domain like L3 are grouped into a resource whose domain is L3
+ */
+ for_each_mon_enabled_rdt_resource(r) {
+ ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
+ if (ret)
+ goto out_destroy;
+ }
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode,
+ enum rdt_group_type rtype, struct rdtgroup **r)
+{
+ struct rdtgroup *prdtgrp, *rdtgrp;
+ struct kernfs_node *kn;
+ uint files = 0;
+ int ret;
+
+ prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+ if (!prdtgrp) {
+ ret = -ENODEV;
goto out_unlock;
- closid = ret;
+ }
/* allocate the rdtgroup. */
rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
if (!rdtgrp) {
ret = -ENOSPC;
- goto out_closid_free;
+ goto out_unlock;
}
- rdtgrp->closid = closid;
- list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+ *r = rdtgrp;
+ rdtgrp->mon.parent = prdtgrp;
+ rdtgrp->type = rtype;
+ INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
/* kernfs creates the directory for rdtgrp */
- kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_cancel_ref;
+ goto out_free_rgrp;
}
rdtgrp->kn = kn;
@@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- ret = rdtgroup_add_files(kn, rdtgroup_base_files,
- ARRAY_SIZE(rdtgroup_base_files));
+ files = RFTYPE_BASE | RFTYPE_CTRL;
+ files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ ret = rdtgroup_add_files(kn, files);
if (ret)
goto out_destroy;
+ if (rdt_mon_capable) {
+ ret = alloc_rmid();
+ if (ret < 0)
+ goto out_destroy;
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret)
+ goto out_idfree;
+ }
kernfs_activate(kn);
- ret = 0;
- goto out_unlock;
+ /*
+ * The caller unlocks the prgrp_kn upon success.
+ */
+ return 0;
+out_idfree:
+ free_rmid(rdtgrp->mon.rmid);
out_destroy:
kernfs_remove(rdtgrp->kn);
-out_cancel_ref:
- list_del(&rdtgrp->rdtgroup_list);
+out_free_rgrp:
kfree(rdtgrp);
-out_closid_free:
- closid_free(closid);
out_unlock:
- rdtgroup_kn_unlock(parent_kn);
+ rdtgroup_kn_unlock(prgrp_kn);
return ret;
}
-static int rdtgroup_rmdir(struct kernfs_node *kn)
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+ kernfs_remove(rgrp->kn);
+ free_rmid(rgrp->mon.rmid);
+ kfree(rgrp);
+}
+
+/*
+ * Create a monitor group under "mon_groups" directory of a control
+ * and monitor group(ctrl_mon). This is a resource group
+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *rdtgrp, *prgrp;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
+
+ prgrp = rdtgrp->mon.parent;
+ rdtgrp->closid = prgrp->closid;
+
+ /*
+ * Add the rdtgrp to the list of rdtgrps the parent
+ * ctrl_mon group has to track.
+ */
+ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+
+ rdtgroup_kn_unlock(prgrp_kn);
+ return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode)
{
- int ret, cpu, closid = rdtgroup_default.closid;
struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
+ struct kernfs_node *kn;
+ u32 closid;
+ int ret;
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
- rdtgrp = rdtgroup_kn_lock_live(kn);
- if (!rdtgrp) {
- ret = -EPERM;
- goto out;
+ kn = rdtgrp->kn;
+ ret = closid_alloc();
+ if (ret < 0)
+ goto out_common_fail;
+ closid = ret;
+
+ rdtgrp->closid = closid;
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ if (rdt_mon_capable) {
+ /*
+ * Create an empty mon_groups directory to hold the subset
+ * of tasks and cpus to monitor.
+ */
+ ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+ if (ret)
+ goto out_id_free;
}
+ goto out_unlock;
+
+out_id_free:
+ closid_free(closid);
+ list_del(&rdtgrp->rdtgroup_list);
+out_common_fail:
+ mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(prgrp_kn);
+ return ret;
+}
+
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ * This makes sure "mon_groups" directory always has a ctrl_mon group
+ * as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+ return (!strcmp(kn->name, "mon_groups") &&
+ strcmp(name, "mon_groups"));
+}
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ /*
+ * If the parent directory is the root directory and RDT
+ * allocation is supported, add a control and monitoring
+ * subdirectory
+ */
+ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+
+ /*
+ * If RDT monitoring is supported and the parent directory is a valid
+ * "mon_groups" directory, add a monitoring subdirectory.
+ */
+ if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+ return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+
+ return -EPERM;
+}
+
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ int cpu;
+
+ /* Give any tasks back to the parent group */
+ rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+ /* Update per cpu rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * Remove the rdtgrp from the parent ctrl_mon group's list
+ */
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_del(&rdtgrp->mon.crdtgrp_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+
+ return 0;
+}
+
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ int cpu;
+
/* Give any tasks back to the default group */
rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
@@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
- /* Update per cpu closid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(cpu_closid, cpu) = closid;
+ /* Update per cpu closid and rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+ per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
+ per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
+ }
+
/*
* Update the MSR on moved CPUs and CPUs which have moved
* task running on them.
*/
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- rdt_update_closid(tmpmask, NULL);
+ update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
closid_free(rdtgrp->closid);
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * Free all the child monitor group rmids.
+ */
+ free_all_child_rdtgrp(rdtgrp);
+
list_del(&rdtgrp->rdtgroup_list);
/*
@@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
*/
kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
- ret = 0;
+
+ return 0;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent_kn = kn->parent;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the rdtgroup is a ctrl_mon group and parent directory
+ * is the root directory, remove the ctrl_mon group.
+ *
+ * If the rdtgroup is a mon group and parent directory
+ * is a valid "mon_groups" directory, remove the mon group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
+ ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ else if (rdtgrp->type == RDTMON_GROUP &&
+ is_mon_groups(parent_kn, kn->name))
+ ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+ else
+ ret = -EPERM;
+
out:
rdtgroup_kn_unlock(kn);
free_cpumask_var(tmpmask);
@@ -1129,7 +1845,7 @@ out:
static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
seq_puts(seq, ",cdp");
return 0;
}
@@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void)
mutex_lock(&rdtgroup_mutex);
rdtgroup_default.closid = 0;
+ rdtgroup_default.mon.rmid = 0;
+ rdtgroup_default.type = RDTCTRL_GROUP;
+ INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
- ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
- ARRAY_SIZE(rdtgroup_base_files));
+ ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
if (ret) {
kernfs_destroy_root(rdt_root);
goto out;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9e314bcf67cc..40e28ed77fbf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
- /* Collect bank_info using CPU 0 for now. */
- if (cpu)
+ /* Return early if this bank was already initialized. */
+ if (smca_banks[bank].hwid)
return;
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
@@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (hwid_mcatype == s_hwid->hwid_mcatype) {
-
- WARN(smca_banks[bank].hwid,
- "Bank %s already initialized!\n",
- smca_get_name(s_hwid->bank_type));
-
smca_banks[bank].hwid = s_hwid;
smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
@@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
mce_log(&m);
}
-static inline void __smp_deferred_error_interrupt(void)
-{
- inc_irq_stat(irq_deferred_error_count);
- deferred_error_int_vector();
-}
-
asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
{
entering_irq();
- __smp_deferred_error_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
-{
- entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
- __smp_deferred_error_interrupt();
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index f7370abd33c6..2da67b70ba98 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-static inline void __smp_thermal_interrupt(void)
-{
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
-}
-
-asmlinkage __visible void __irq_entry
-smp_thermal_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_thermal_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry
-smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- __smp_thermal_interrupt();
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index bb0e75eed10a..5e7249e42f8f 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,24 +17,12 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-static inline void __smp_threshold_interrupt(void)
-{
- inc_irq_stat(irq_threshold_count);
- mce_threshold_vector();
-}
-
asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
{
entering_irq();
- __smp_threshold_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
-{
- entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
- __smp_threshold_interrupt();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 21b185793c80..c6daec4bdba5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch)
list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id)
+ if (p->patch_id >= new_patch->patch_id) {
/* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
return;
+ }
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 59edbe9d4ccb..8f7a9bbad514 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header,
return false;
}
-static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
if (!p)
- return ERR_PTR(-ENOMEM);
+ return NULL;
p->data = kmemdup(data, size, GFP_KERNEL);
if (!p->data) {
kfree(p);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
return p;
@@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size)
if (mc_hdr->rev <= mc_saved_hdr->rev)
continue;
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer %p\n", data);
else
list_replace(&iter->plist, &p->plist);
@@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size)
* newly found.
*/
if (!prev_found) {
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer for %p\n", data);
else
list_add_tail(&p->plist, &microcode_cache);
}
+ if (!p)
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
* paging has been enabled.
*/
- if (p) {
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
- else
- intel_ucode_patch = p->data;
- }
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
}
static int microcode_sanity_check(void *mc, int print_err)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index daefd67a66c7..3b3f713e15e5 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs)
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
- /*
- * Setup the IDT for hypervisor callback. Prevent reallocation
- * at module reload.
- */
- if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
- hyperv_callback_vector);
+ /* Setup the IDT for hypervisor callback */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
@@ -184,9 +179,15 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- pr_info("HyperV: features 0x%x, hints 0x%x\n",
+ pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
+ ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
/*
* Extract host information.
*/
@@ -219,7 +220,7 @@ static void __init ms_hyperv_init_platform(void)
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_frequency = hv_lapic_frequency;
- pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
}
@@ -254,7 +255,7 @@ static void __init ms_hyperv_init_platform(void)
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
- .name = "Microsoft HyperV",
+ .name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
.init_platform = ms_hyperv_init_platform,
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..f13b4c00a5de 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+ __show_regs(regs, 0);
+
/*
* Scan the stack, printing any text addresses we find. At the
* same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
*/
- if (regs && stack == &regs->ip) {
- unwind_next_frame(&state);
- continue;
- }
+ if (regs && stack == &regs->ip)
+ goto next;
if (stack == ret_addr_p)
reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (!reliable)
continue;
+next:
/*
* Get the next frame from the unwinder. No need to
* check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
- if (regs)
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
}
@@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_X86_32
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
} else {
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
begin = end - (exception_stack_sizes[k] / sizeof(long));
regs = (struct pt_regs *)end - 1;
- if (stack < begin || stack >= end)
+ if (stack <= begin || stack >= end)
continue;
info->type = STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..927abeaf63e2 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,10 +12,10 @@
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/delay.h>
-#include <linux/dmi.h>
#include <linux/pci_ids.h>
#include <linux/bcma/bcma.h>
#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
};
static void __init
@@ -593,7 +594,7 @@ static void __init apple_airport_reset(int bus, int slot, int func)
u64 addr;
int i;
- if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ if (!x86_apple_machine)
return;
/* Card may have been put into PCI_D3hot by grub quirk */
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..f260e452e4f8
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,19 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+static __init int eisa_bus_probe(void)
+{
+ void __iomem *p = ioremap(0x0FFFD9, 4);
+
+ if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+ EISA_bus = 1;
+ iounmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 538ec012b371..cf2ce063f65a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/memblock.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/e820/api.h>
@@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
cr4_init_shadow();
+
+ idt_setup_early_handler();
+
sanitize_boot_params(&boot_params);
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6a193b93fd95..bab4fa579450 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -311,8 +311,6 @@ static void __init copy_bootdata(char *real_mode_data)
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -345,9 +343,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handler_array[i]);
- load_idt((const struct desc_ptr *)&idt_descr);
+ idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..9ed3074d0d27 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
subarch_entries:
.long .Ldefault_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
@@ -347,7 +345,6 @@ ENTRY(startup_32_smp)
movl %eax,%cr0
lgdt early_gdt_descr
- lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
@@ -380,37 +377,6 @@ ENDPROC(startup_32_smp)
*/
__INIT
setup_once:
- /*
- * Set up a idt with 256 interrupt gates that push zero if there
- * is no error code and then jump to early_idt_handler_common.
- * It doesn't actually load the idt - that needs to be done on
- * each CPU. Interrupts are enabled elsewhere, when we can be
- * relatively sure everything is ok.
- */
-
- movl $idt_table,%edi
- movl $early_idt_handler_array,%eax
- movl $NUM_EXCEPTION_VECTORS,%ecx
-1:
- movl %eax,(%edi)
- movl %eax,4(%edi)
- /* interrupt gate, dpl=0, present */
- movl $(0x8E000000 + __KERNEL_CS),2(%edi)
- addl $EARLY_IDT_HANDLER_SIZE,%eax
- addl $8,%edi
- loop 1b
-
- movl $256 - NUM_EXCEPTION_VECTORS,%ecx
- movl $ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-2:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- loop 2b
-
#ifdef CONFIG_CC_STACKPROTECTOR
/*
* Configure the stack canary. The linker can't handle this by
@@ -457,12 +423,9 @@ early_idt_handler_common:
/* The vector number is in pt_regs->gs */
cld
- pushl %fs /* pt_regs->fs */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %es /* pt_regs->es */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %ds /* pt_regs->ds */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
pushl %eax /* pt_regs->ax */
pushl %ebp /* pt_regs->bp */
pushl %edi /* pt_regs->di */
@@ -479,9 +442,8 @@ early_idt_handler_common:
/* Load the vector number into EDX */
movl PT_GS(%esp), %edx
- /* Load GS into pt_regs->gs and clear high bits */
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
movw %gs, PT_GS(%esp)
- movw $0, PT_GS+2(%esp)
movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
call early_fixup_exception
@@ -493,18 +455,17 @@ early_idt_handler_common:
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
popl %eax /* pt_regs->ax */
- popl %ds /* pt_regs->ds */
- popl %es /* pt_regs->es */
- popl %fs /* pt_regs->fs */
- popl %gs /* pt_regs->gs */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
decl %ss:early_recursion_flag
addl $4, %esp /* pop pt_regs->orig_ax */
iret
ENDPROC(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
+ENTRY(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -539,7 +500,8 @@ ignore_int:
hlt_loop:
hlt
jmp hlt_loop
-ENDPROC(ignore_int)
+ENDPROC(early_ignore_irq)
+
__INITDATA
.align 4
GLOBAL(early_recursion_flag)
@@ -628,7 +590,6 @@ int_msg:
.data
.globl boot_gdt_descr
-.globl idt_descr
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
@@ -637,11 +598,6 @@ boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
- .word 0 # 32-bit align idt_desc.address
-idt_descr:
- .word IDT_ENTRIES*8-1 # idt contains 256 entries
- .long idt_table
-
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000000..6107ee1cb8d5
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,371 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+
+struct idt_data {
+ unsigned int vector;
+ unsigned int segment;
+ struct idt_bits bits;
+ const void *addr;
+};
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate with interrupt stack */
+#define SISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ SYSG(X86_TRAP_BP, int3),
+#ifdef CONFIG_X86_32
+ INTG(X86_TRAP_PF, page_fault),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initdata struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, divide_error),
+ INTG(X86_TRAP_NMI, nmi),
+ INTG(X86_TRAP_BR, bounds),
+ INTG(X86_TRAP_UD, invalid_op),
+ INTG(X86_TRAP_NM, device_not_available),
+ INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
+ INTG(X86_TRAP_TS, invalid_TSS),
+ INTG(X86_TRAP_NP, segment_not_present),
+ INTG(X86_TRAP_SS, stack_segment),
+ INTG(X86_TRAP_GP, general_protection),
+ INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, coprocessor_error),
+ INTG(X86_TRAP_AC, alignment_check),
+ INTG(X86_TRAP_XF, simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ INTG(X86_TRAP_DF, double_fault),
+#endif
+ INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_NMI, nmi),
+ INTG(X86_TRAP_BP, int3),
+
+#ifdef CONFIG_X86_MCE
+ INTG(X86_TRAP_MC, &machine_check),
+#endif
+
+ SYSG(X86_TRAP_OF, overflow),
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initdata struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
+ INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
+ INTG(REBOOT_VECTOR, reboot_interrupt),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(ERROR_APIC_VECTOR, error_interrupt),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, page_fault),
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initdata struct idt_data dbg_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_BP, int3),
+};
+#endif
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .address = (unsigned long) idt_table,
+};
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initdata struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
+ ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
+ SISTG(X86_TRAP_BP, int3, DEBUG_STACK),
+ ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, &machine_check, MCE_STACK),
+#endif
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+const struct desc_ptr debug_idt_descr = {
+ .size = IDT_ENTRIES * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
+#endif
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr;
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32);
+ gate->reserved = 0;
+#endif
+}
+
+static void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, used_vectors);
+ }
+}
+
+static void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ BUG_ON(n > 0xFF);
+
+ memset(&data, 0, sizeof(data));
+ data.vector = n;
+ data.addr = addr;
+ data.segment = __KERNEL_CS;
+ data.bits.type = GATE_INTERRUPT;
+ data.bits.p = 1;
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X8664 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X8664 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why is 32bit and 64bit installing the PF handler at different
+ * places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+
+/**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+ idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
+}
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+ memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+
+ idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
+}
+#endif
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ set_bit(i, used_vectors);
+ set_intr_gate(i, spurious_interrupt);
+#else
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+#endif
+ }
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr: The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+ struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init update_intr_gate(unsigned int n, const void *addr)
+{
+ if (WARN_ON_ONCE(!test_bit(n, used_vectors)))
+ return;
+ set_intr_gate(n, addr);
+}
+
+void alloc_intr_gate(unsigned int n, const void *addr)
+{
+ BUG_ON(n < FIRST_SYSTEM_VECTOR);
+ if (!test_and_set_bit(n, used_vectors))
+ set_intr_gate(n, addr);
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4ed0aba8dbc8..52089c043160 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
atomic_t irq_err_count;
-/* Function pointer for generic interrupt vector handling */
-void (*x86_platform_ipi_callback)(void) = NULL;
-
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
seq_puts(p, " APIC ICR read retries\n");
-#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
seq_puts(p, " Platform interrupts\n");
}
+#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
@@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
sum += irq_stats(cpu)->icr_read_retry_count;
-#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -259,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
-void __smp_x86_platform_ipi(void)
-{
- inc_irq_stat(x86_platform_ipis);
-
- if (x86_platform_ipi_callback)
- x86_platform_ipi_callback();
-}
-
__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
- __smp_x86_platform_ipi();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis);
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
exiting_irq();
set_irq_regs(old_regs);
}
+#endif
#ifdef CONFIG_HAVE_KVM
static void dummy_handler(void) {}
@@ -334,19 +331,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
}
#endif
-__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
- trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
- __smp_x86_platform_ipi();
- trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
- exiting_irq();
- set_irq_regs(old_regs);
-}
-
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
#ifdef CONFIG_HOTPLUG_CPU
@@ -431,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void)
* this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
- vector < first_system_vector; vector++) {
+ vector < FIRST_SYSTEM_VECTOR; vector++) {
if (!test_bit(vector, used_vectors) &&
IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
if (++count == this_count)
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 275487872be2..70dee056f92b 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -11,35 +11,23 @@
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>
-static inline void __smp_irq_work_interrupt(void)
-{
- inc_irq_stat(apic_irq_work_irqs);
- irq_work_run();
-}
-
+#ifdef CONFIG_X86_LOCAL_APIC
__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
- __smp_irq_work_interrupt();
- exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
- __smp_irq_work_interrupt();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
trace_irq_work_exit(IRQ_WORK_VECTOR);
exiting_irq();
}
void arch_irq_work_raise(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
if (!arch_irq_work_has_interrupt())
return;
apic->send_IPI_self(IRQ_WORK_VECTOR);
apic_wait_icr_idle();
-#endif
}
+#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c7fd18526c3e..1add9e08e83e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
- return 1;
- }
-
- return 0;
-}
-
void __init init_ISA_irqs(void)
{
struct irq_chip *chip = legacy_pic->chip;
@@ -99,100 +87,12 @@ void __init init_IRQ(void)
x86_init.irqs.intr_init();
}
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-
- /* IPI used for rebooting/stopping */
- alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#ifdef CONFIG_X86_MCE_THRESHOLD
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
- alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI for X86 platform specific use */
- alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
-#ifdef CONFIG_HAVE_KVM
- /* IPI for KVM to deliver posted interrupt */
- alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
- /* IPI for KVM to deliver interrupt to wake up tasks */
- alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
- /* IPI for KVM to deliver nested posted interrupt */
- alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi);
-#endif
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* IRQ work interrupts: */
-# ifdef CONFIG_IRQ_WORK
- alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
-# endif
-
-#endif
-}
-
void __init native_init_IRQ(void)
{
- int i;
-
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
- apic_intr_init();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- i = FIRST_EXTERNAL_VECTOR;
-#ifndef CONFIG_X86_LOCAL_APIC
-#define first_system_vector NR_VECTORS
-#endif
- for_each_clear_bit_from(i, used_vectors, first_system_vector) {
- /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, irq_entries_start +
- 8 * (i - FIRST_EXTERNAL_VECTOR));
- }
-#ifdef CONFIG_X86_LOCAL_APIC
- for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
- set_intr_gate(i, spurious_interrupt);
-#endif
+ idt_setup_apic_and_irq_gates();
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
+#include <asm/sections.h>
#include "common.h"
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
/*
* Do not optimize in the entry code due to the unstable
- * stack handling.
+ * stack handling and registers setup.
*/
- if ((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end))
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d04e30e3c0ff..874827b0d7ca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
switch (kvm_read_and_reset_pf_reason()) {
default:
- trace_do_page_fault(regs, error_code);
+ do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
@@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, async_page_fault);
+ update_intr_gate(X86_TRAP_PF, async_page_fault);
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *__mm)
{
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+ refresh_ldt_segments();
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8c53c5d7a1bc..00bc751c861c 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -26,18 +26,6 @@
#include <asm/set_memory.h>
#include <asm/debugreg.h>
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
static void set_gdt(void *newgdt, __u16 limit)
{
struct desc_ptr curgdt;
@@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image)
* If you want to load them you must set up your own idt & gdt.
*/
set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ idt_invalidate(phys_to_virt(0));
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f67bd3205df7..62e7d70aadd5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
+#include <asm/unwind.h>
#if 0
#define DEBUGP(fmt, ...) \
@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
}
if (alt) {
@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
/* make jump label nops */
jump_label_apply_nops(me);
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
return 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 446c8aa09b9b..35aafc95e4b8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -39,26 +39,26 @@
#include <trace/events/nmi.h>
struct nmi_desc {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct list_head head;
};
static struct nmi_desc nmi_desc[NMI_MAX] =
{
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
.head = LIST_HEAD_INIT(nmi_desc[0].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
.head = LIST_HEAD_INIT(nmi_desc[2].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
.head = LIST_HEAD_INIT(nmi_desc[3].head),
},
@@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
init_irq_work(&action->irq_work, nmi_max_handler);
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
/*
* Indicate if there are multiple registrations on the
@@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
else
list_add_tail_rcu(&action->list, &desc->head);
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);
@@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
struct nmiaction *n;
unsigned long flags;
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
list_for_each_entry_rcu(n, &desc->head, list) {
/*
@@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
}
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849589bb..a14df9eecfed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = {
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
-#endif
};
__visible struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
- case X86_SUBARCH_LGUEST:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c6d6dc5f8bb2..11966251cd42 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
#include <asm/proto.h>
void __show_regs(struct pt_regs *regs, int all)
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
gs = get_user_gs(regs);
} else {
sp = kernel_stack_pointer(regs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..302e7b2572d1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
}
}
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- unsigned prev_fsindex, prev_gsindex;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
switch_fpu_prepare(prev_fpu, cpu);
@@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
- savesegment(fs, prev_fsindex);
- savesegment(gs, prev_gsindex);
+ save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- /*
- * Switch FS and GS.
- *
- * These are even more complicated than DS and ES: they have
- * 64-bit bases are that controlled by arch_prctl. The bases
- * don't necessarily match the selectors, as user code can do
- * any number of things to cause them to be inconsistent.
- *
- * We don't promise to preserve the bases if the selectors are
- * nonzero. We also don't promise to preserve the base if the
- * selector is zero and the base doesn't match whatever was
- * most recently passed to ARCH_SET_FS/GS. (If/when the
- * FSGSBASE instructions are enabled, we'll need to offer
- * stronger guarantees.)
- *
- * As an invariant,
- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
- * impossible.
- */
- if (next->fsindex) {
- /* Loading a nonzero value into FS sets the index and base. */
- loadsegment(fs, next->fsindex);
- } else {
- if (next->fsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_fsindex)
- loadsegment(fs, 0);
- wrmsrl(MSR_FS_BASE, next->fsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- */
- loadsegment(fs, __USER_DS);
- loadsegment(fs, 0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_FS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->fsbase || prev_fsindex)
- loadsegment(fs, 0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_fsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_fsindex)
- prev->fsbase = 0;
- prev->fsindex = prev_fsindex;
-
- if (next->gsindex) {
- /* Loading a nonzero value into GS sets the index and base. */
- load_gs_index(next->gsindex);
- } else {
- if (next->gsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_gsindex)
- load_gs_index(0);
- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- *
- * This contains a pointless SWAPGS pair.
- * Fixing it would involve an explicit check
- * for Xen or a new pvop.
- */
- load_gs_index(__USER_DS);
- load_gs_index(0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_GS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->gsbase || prev_gsindex)
- load_gs_index(0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_gsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_gsindex)
- prev->gsbase = 0;
- prev->gsindex = prev_gsindex;
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, cpu);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 0bee04d41bed..eaa591cfd98b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -1,6 +1,7 @@
/*
* This file contains work-arounds for x86 and x86_64 platform bugs.
*/
+#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/irq.h>
@@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
#endif
#endif
+
+bool x86_apple_machine;
+EXPORT_SYMBOL(x86_apple_machine);
+
+void __init early_platform_quirks(void)
+{
+ x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") ||
+ dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.");
+}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a56bf6051f4e..54984b142641 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -38,8 +38,6 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static const struct desc_ptr no_idt = {};
-
/*
* This is set if we need to go through the 'emergency' path.
* When machine_emergency_restart() is called, we may be on
@@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_TRIPLE:
- load_idt(&no_idt);
+ idt_invalidate(NULL);
__asm__ __volatile__("int3");
/* We're probably dead after this, but... */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0bfe0c1628f6..d84afb0a322d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -116,6 +116,7 @@
#include <asm/microcode.h>
#include <asm/mmu_context.h>
#include <asm/kaslr.h>
+#include <asm/unwind.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -899,7 +900,7 @@ void __init setup_arch(char **cmdline_p)
*/
olpc_ofw_detect();
- early_trap_init();
+ idt_setup_early_traps();
early_cpu_init();
early_ioremap_init();
@@ -1170,7 +1171,7 @@ void __init setup_arch(char **cmdline_p)
init_mem_mapping();
- early_trap_pf_init();
+ idt_setup_early_pf();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
@@ -1215,6 +1216,8 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
+ early_platform_quirks();
+
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
*/
@@ -1319,6 +1322,8 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_apply_memmap_quirks();
#endif
+
+ unwind_init();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 10edd1e69a68..6e8fcb6f7e1e 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct gdt;
+ struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+ 0xFFFFF);
- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_rw(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc30a74e4adb..e04442345fc0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = current->sas_ss_sp + current->sas_ss_size;
} else if (IS_ENABLED(CONFIG_X86_32) &&
!onsigstack &&
- (regs->ss & 0xffff) != __USER_DS &&
+ regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer) {
/* This is the legacy signal stack switching. */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d798c0da451c..5c574dff4c1a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -254,84 +254,45 @@ finish:
}
/*
- * Reschedule call back.
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode
*/
-static inline void __smp_reschedule_interrupt(void)
-{
- inc_irq_stat(irq_resched_count);
- scheduler_ipi();
-}
-
__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
- __smp_reschedule_interrupt();
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-}
-
-__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
-{
- /*
- * Need to call irq_enter() before calling the trace point.
- * __smp_reschedule_interrupt() calls irq_enter/exit() too (in
- * scheduler_ipi(). This is OK, since those functions are allowed
- * to nest.
- */
- ipi_entering_ack_irq();
- trace_reschedule_entry(RESCHEDULE_VECTOR);
- __smp_reschedule_interrupt();
- trace_reschedule_exit(RESCHEDULE_VECTOR);
- exiting_irq();
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-}
+ inc_irq_stat(irq_resched_count);
-static inline void __smp_call_function_interrupt(void)
-{
- generic_smp_call_function_interrupt();
- inc_irq_stat(irq_call_count);
+ if (trace_resched_ipi_enabled()) {
+ /*
+ * scheduler_ipi() might call irq_enter() as well, but
+ * nested calls are fine.
+ */
+ irq_enter();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
+ scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
+ irq_exit();
+ return;
+ }
+ scheduler_ipi();
}
__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
- __smp_call_function_interrupt();
- exiting_irq();
-}
-
-__visible void __irq_entry
-smp_trace_call_function_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
- __smp_call_function_interrupt();
- trace_call_function_exit(CALL_FUNCTION_VECTOR);
- exiting_irq();
-}
-
-static inline void __smp_call_function_single_interrupt(void)
-{
- generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
-}
-
-__visible void __irq_entry
-smp_call_function_single_interrupt(struct pt_regs *regs)
-{
- ipi_entering_ack_irq();
- __smp_call_function_single_interrupt();
+ generic_smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
exiting_irq();
}
-__visible void __irq_entry
-smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
{
ipi_entering_ack_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
- __smp_call_function_single_interrupt();
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
exiting_irq();
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 5f25cfbd952e..5ee663836c08 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
unsigned long addr, seg;
addr = regs->ip;
- seg = regs->cs & 0xffff;
+ seg = regs->cs;
if (v8086_mode(regs)) {
addr = (addr & 0xffff) + (seg << 4);
return addr;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index dcd699baea1b..a106b9719c58 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
- desc->a = desc->b = 0;
+ memset(desc, 0, sizeof(*desc));
} else {
fill_ldt(desc, info);
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 15515132bf0d..c6636d1f60b9 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -4,57 +4,38 @@
* Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
*
*/
-#include <asm/hw_irq.h>
-#include <asm/desc.h>
+#include <linux/jump_label.h>
#include <linux/atomic.h>
-atomic_t trace_idt_ctr = ATOMIC_INIT(0);
-struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) trace_idt_table };
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
-static int trace_irq_vector_refcount;
-static DEFINE_MUTEX(irq_vector_mutex);
+DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
-static void set_trace_idt_ctr(int val)
+int trace_pagefault_reg(void)
{
- atomic_set(&trace_idt_ctr, val);
- /* Ensure the trace_idt_ctr is set before sending IPI */
- wmb();
+ static_branch_inc(&trace_pagefault_key);
+ return 0;
}
-static void switch_idt(void *arg)
+void trace_pagefault_unreg(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
- load_current_idt();
- local_irq_restore(flags);
+ static_branch_dec(&trace_pagefault_key);
}
-int trace_irq_vector_regfunc(void)
+#ifdef CONFIG_SMP
+
+DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+
+int trace_resched_ipi_reg(void)
{
- mutex_lock(&irq_vector_mutex);
- if (!trace_irq_vector_refcount) {
- set_trace_idt_ctr(1);
- smp_call_function(switch_idt, NULL, 0);
- switch_idt(NULL);
- }
- trace_irq_vector_refcount++;
- mutex_unlock(&irq_vector_mutex);
+ static_branch_inc(&trace_resched_ipi_key);
return 0;
}
-void trace_irq_vector_unregfunc(void)
+void trace_resched_ipi_unreg(void)
{
- mutex_lock(&irq_vector_mutex);
- trace_irq_vector_refcount--;
- if (!trace_irq_vector_refcount) {
- set_trace_idt_ctr(0);
- smp_call_function(switch_idt, NULL, 0);
- switch_idt(NULL);
- }
- mutex_unlock(&irq_vector_mutex);
+ static_branch_dec(&trace_resched_ipi_key);
}
+
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309b85da..34ea3651362e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -38,11 +38,6 @@
#include <linux/smp.h>
#include <linux/io.h>
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -70,20 +65,13 @@
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#include <asm/proto.h>
#endif
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
-
DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
static inline void cond_local_irq_enable(struct pt_regs *regs)
{
@@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
}
#endif
-/* Set of traps needed for early debugging. */
-void __init early_trap_init(void)
-{
- /*
- * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
- * is ready in cpu_init() <-- trap_init(). Before trap_init(),
- * CPU runs at ring 0 so it is impossible to hit an invalid
- * stack. Using the original stack works well enough at this
- * early stage. DEBUG_STACK will be equipped after cpu_init() in
- * trap_init().
- *
- * We don't need to set trace_idt_table like set_intr_gate(),
- * since we don't have trace_debug and it will be reset to
- * 'debug' in trap_init() by set_intr_gate_ist().
- */
- set_intr_gate_notrace(X86_TRAP_DB, debug);
- /* int3 can be called from all */
- set_system_intr_gate(X86_TRAP_BP, &int3);
-#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
- load_idt(&idt_descr);
-}
-
-void __init early_trap_pf_init(void)
-{
-#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
-}
-
void __init trap_init(void)
{
- int i;
-
-#ifdef CONFIG_EISA
- void __iomem *p = early_ioremap(0x0FFFD9, 4);
-
- if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
- EISA_bus = 1;
- early_iounmap(p, 4);
-#endif
-
- set_intr_gate(X86_TRAP_DE, divide_error);
- set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
- /* int4 can be called from all */
- set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, bounds);
- set_intr_gate(X86_TRAP_UD, invalid_op);
- set_intr_gate(X86_TRAP_NM, device_not_available);
-#ifdef CONFIG_X86_32
- set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
-#else
- set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
-#endif
- set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, invalid_TSS);
- set_intr_gate(X86_TRAP_NP, segment_not_present);
- set_intr_gate(X86_TRAP_SS, stack_segment);
- set_intr_gate(X86_TRAP_GP, general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, coprocessor_error);
- set_intr_gate(X86_TRAP_AC, alignment_check);
-#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
-#endif
- set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
-
-#ifdef CONFIG_X86_32
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
+ idt_setup_traps();
/*
* Set the IDT descriptor to a fixed read-only location, so that the
@@ -1030,20 +940,9 @@ void __init trap_init(void)
*/
cpu_init();
- /*
- * X86_TRAP_DB and X86_TRAP_BP have been set
- * in early_trap_init(). However, ITS works only after
- * cpu_init() loads TSS. See comments in early_trap_init().
- */
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
- /* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ idt_setup_ist_traps();
x86_init.irqs.trap_init();
-#ifdef CONFIG_X86_64
- memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
- set_nmi_gate(X86_TRAP_DB, &debug);
- set_nmi_gate(X86_TRAP_BP, &int3);
-#endif
+ idt_setup_debugidt_traps();
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..d145a0b1f529 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -10,20 +10,22 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
-/*
- * This disables KASAN checking when reading a value from another task's stack,
- * since the other task could be running on another CPU and could have poisoned
- * the stack in the meantime.
- */
-#define READ_ONCE_TASK_STACK(task, x) \
-({ \
- unsigned long val; \
- if (task == current) \
- val = READ_ONCE(x); \
- else \
- val = READ_ONCE_NOCHECK(x); \
- val; \
-})
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
static void unwind_dump(struct unwind_state *state)
{
@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
}
}
-unsigned long unwind_get_return_address(struct unwind_state *state)
-{
- if (unwind_done(state))
- return 0;
-
- return __kernel_text_address(state->ip) ? state->ip : 0;
-}
-EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
static size_t regs_size(struct pt_regs *regs)
{
/* x86_32 regs from kernel mode are two words shorter: */
@@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip)
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
-#endif
return false;
}
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 039f36738e49..4f0e17b90463 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
bool unwind_next_frame(struct unwind_state *state)
{
struct stack_info *info = &state->stack_info;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..570b70d3f604
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,582 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/sections.h>
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid = first, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ if (!orc_init)
+ return NULL;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
+ idx, lookup_num_blocks, ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
+ idx, lookup_num_blocks, start, stop, ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ return orc_module_find(ip);
+}
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ struct orc_entry orc_tmp;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ orc_tmp = *orc_a;
+ *orc_a = *orc_b;
+ *orc_b = orc_tmp;
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be on the left
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
+}
+
+#ifdef CONFIG_MODULES
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */
+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+ orc_sort_swap);
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If the address isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the address
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, (void *)addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+ return true;
+}
+
+#define REGS_SIZE (sizeof(struct pt_regs))
+#define SP_OFFSET (offsetof(struct pt_regs, sp))
+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp, bool full)
+{
+ size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+ size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+ struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ if (!stack_access_ok(state, addr, regs_size))
+ return false;
+
+ *ip = regs->ip;
+ *sp = regs->sp;
+
+ return true;
+ }
+
+ if (!stack_access_ok(state, addr, sp_offset))
+ return false;
+
+ *ip = regs->ip;
+
+ if (user_mode(regs)) {
+ if (!stack_access_ok(state, addr + sp_offset,
+ REGS_SIZE - SP_OFFSET))
+ return false;
+
+ *sp = regs->sp;
+ } else
+ *sp = (unsigned long)&regs->sp;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ struct pt_regs *ptregs;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ preempt_disable();
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto done;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * Decrement call return addresses by one so they work for sibling
+ * calls and calls to noreturn functions.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
+ goto done;
+ orig_ip = state->ip;
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R10 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r10;
+ break;
+
+ case ORC_REG_R13:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R13 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r13;
+ break;
+
+ case ORC_REG_DI:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DI at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->di;
+ break;
+
+ case ORC_REG_DX:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DX at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->dx;
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d for ip %p\n",
+ orc->sp_reg, (void *)state->ip);
+ goto done;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto done;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto done;
+
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ state->ip, (void *)ip_p);
+
+ state->sp = sp;
+ state->regs = NULL;
+ state->signal = false;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ orc_warn("can't dereference registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ state->regs = (struct pt_regs *)sp;
+ state->full_regs = true;
+ state->signal = true;
+ break;
+
+ case ORC_TYPE_REGS_IRET:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ orc_warn("can't dereference iret registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ ptregs = container_of((void *)sp, struct pt_regs, ip);
+ if ((unsigned long)ptregs >= prev_sp &&
+ on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+ state->regs = ptregs;
+ state->full_regs = false;
+ } else
+ state->regs = NULL;
+
+ state->signal = true;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+ break;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (state->regs && state->full_regs)
+ state->bp = state->regs->bp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %p\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto done;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn("stack going in the wrong direction? ip=%p\n",
+ (void *)orig_ip);
+ goto done;
+ }
+
+ preempt_enable();
+ return true;
+
+done:
+ preempt_enable();
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto done;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto done;
+
+ state->ip = regs->ip;
+ state->sp = kernel_stack_pointer(regs);
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp;
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask))
+ return;
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+done:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..f05f00acac89 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
@@ -148,6 +149,8 @@ SECTIONS
BUG_TABLE
+ ORC_UNWIND_TABLE
+
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2eea522c..19adbb418443 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
/* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled)
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2595f9..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
-static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_pkru(vcpu);
-}
-
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..4b9a3ae6b725 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* index of the protection domain, so pte_pkey * 2 is
* is the index of the first bit for the domain.
*/
- pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
offset = (pfec & ~1) +
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 099ff08b4aff..8dbd8dbc83eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
-static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
-
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -5414,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .get_pkru = svm_get_pkru,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 416d5ed320b6..70b90c0810d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -636,8 +636,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- bool guest_pkru_valid;
- u32 guest_pkru;
u32 host_pkru;
/*
@@ -2383,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->guest_pkru;
-}
-
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -8786,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
desc = (gate_desc *)vmx->host_idt_base + vector;
- entry = gate_offset(*desc);
+ entry = gate_offset(desc);
asm volatile(
#ifdef CONFIG_X86_64
"mov %%" _ASM_SP ", %[sp]\n\t"
@@ -9020,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (vmx->guest_pkru_valid)
- __write_pkru(vmx->guest_pkru);
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -9169,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
- if (boot_cpu_has(X86_FEATURE_OSPKE)) {
- vmx->guest_pkru = __read_pkru();
- if (vmx->guest_pkru != vmx->host_pkru) {
- vmx->guest_pkru_valid = true;
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
- } else
- vmx->guest_pkru_valid = false;
}
/*
@@ -11682,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .get_pkru = vmx_get_pkru,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eda4bdbd7e5e..ef5102f80497 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3246,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest + offset, src, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
}
valid -= feature;
@@ -3284,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest, src + offset, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
}
valid -= feature;
@@ -6726,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
- /*
- * The physical address of apic access page is stored in the VMCS.
- * Update it when it becomes invalid.
- */
- if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -7634,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
trace_kvm_fpu(1);
}
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
- bool "Lguest guest support"
- depends on X86_32 && PARAVIRT && PCI
- select TTY
- select VIRTUALIZATION
- select VIRTIO
- select VIRTIO_CONSOLE
- help
- Lguest is a tiny in-kernel hypervisor. Selecting this will
- allow your kernel to boot under lguest. This option will increase
- your kernel size by about 10k. If in doubt, say N.
-
- If you say Y here, make sure you say Y (or M) to the virtio block
- and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y := head_32.o boot.o
-CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways. First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes. We call the first kernel the Host,
- * and the others the Guests. The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time. This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest? We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h> /* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
- .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_iret = (u32)lguest_noirq_iret,
- .kernel_address = PAGE_OFFSET,
- .blocked_interrupts = { 1 }, /* Block timer interrupts */
- .syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really. We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly. This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* Note: This code assumes we're uniprocessor. */
- static unsigned int next_call;
- unsigned long flags;
-
- /*
- * Disable interrupts if not already disabled: we don't want an
- * interrupt handler making a hypercall while we're already doing
- * one!
- */
- local_irq_save(flags);
- if (lguest_data.hcall_status[next_call] != 0xFF) {
- /* Table full, so do normal hcall which will flush table. */
- hcall(call, arg1, arg2, arg3, arg4);
- } else {
- lguest_data.hcalls[next_call].arg0 = call;
- lguest_data.hcalls[next_call].arg1 = arg1;
- lguest_data.hcalls[next_call].arg2 = arg2;
- lguest_data.hcalls[next_call].arg3 = arg3;
- lguest_data.hcalls[next_call].arg4 = arg4;
- /* Arguments must all be written before we mark it to go */
- wmb();
- lguest_data.hcall_status[next_call] = 0;
- if (++next_call == LHCALL_RING_SIZE)
- next_call = 0;
- }
- local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall(). This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off. Because hypercalls
- * are reasonably expensive, batching them up makes sense. For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, 0, 0, 0);
- else
- async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
- unsigned long arg1,
- unsigned long arg2)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, 0, 0);
- else
- async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, 0);
- else
- async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, arg4);
- else
- async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction. The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags"). The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
- return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
- lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment. Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules. In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register. To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch. One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled. There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag. If an interrupt does come
- * in, we then disable them for real. This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in. Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares? The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
- int entrynum, const gate_desc *g)
-{
- /*
- * The gate_desc structure is 8 bytes long: we hand it to the Host in
- * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
- * around like this; typesafety wasn't a big concern in Linux's early
- * years.
- */
- u32 *desc = (u32 *)g;
- /* Keep the local copy up to date. */
- native_write_idt_entry(dt, entrynum, g);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *idt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table. There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *gdt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
- const void *desc, int type)
-{
- native_write_gdt_entry(dt, entrynum, desc, type);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
- dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables). As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- /*
- * There's one problem which normal hardware doesn't have: the Host
- * can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway.
- */
- lazy_load_gs(0);
- lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy. Linux only
- * uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment. Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 6 languages. I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction. Then I pretty much turned
- * off feature bits until the Guest booted. (Don't say that: you'll damage
- * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future proof.) No one's listening! They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int function = *ax;
-
- native_cpuid(ax, bx, cx, dx);
- switch (function) {
- /*
- * CPUID 0 gives the highest legal CPUID number (and the ID string).
- * We futureproof our code a little by sticking to known CPUID values.
- */
- case 0:
- if (*ax > 5)
- *ax = 5;
- break;
-
- /*
- * CPUID 1 is a basic feature request.
- *
- * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
- * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
- */
- case 1:
- *cx &= 0x00002201;
- *dx &= 0x07808151;
- /*
- * The Host can do a nice optimization if it knows that the
- * kernel mappings (addresses above 0xC0000000 or whatever
- * PAGE_OFFSET is set to) haven't changed. But Linux calls
- * flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set.
- */
- *dx |= 0x00002000;
- /*
- * We also lie, and say we're family id 5. 6 or greater
- * leads to a rdmsr in early_init_intel which we can't handle.
- * Family ID is returned as bits 8-12 in ax.
- */
- *ax &= 0xFFFFF0FF;
- *ax |= 0x00000500;
- break;
-
- /*
- * This is used to detect if we're running under KVM. We might be,
- * but that's a Host matter, not us. So say we're not.
- */
- case KVM_CPUID_SIGNATURE:
- *bx = *cx = *dx = 0;
- break;
-
- /*
- * 0x80000000 returns the highest Extended Function, so we futureproof
- * like we do above by limiting it to known fields.
- */
- case 0x80000000:
- if (*ax > 0x80000008)
- *ax = 0x80000008;
- break;
-
- /*
- * PAE systems can mark pages as non-executable. Linux calls this the
- * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch it off here, since we don't
- * support it.
- */
- case 0x80000001:
- *dx &= ~(1 << 20);
- break;
- }
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it. The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0. cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
- return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
- return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
- lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- current_cr3 = cr3;
-
- /* These two page tables are simple, linear, and used during boot */
- if (cr3 != __pa_symbol(swapper_pg_dir) &&
- cr3 != __pa_symbol(initial_page_table))
- cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
- return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
- return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant. The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
- * maps virtual addresses to physical addresses using "page tables". We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables. But since most virtual addresses
- * are unused, we use a two level index which saves space. The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages. Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page. If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- * Index into top Index into second Offset within page
- * page directory page pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper ]
- * [ Level ]
- * [ Entries ]
- * [(PUD Page)]---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- *
- * And the virtual address is decoded as:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into Index into mid Index into lower Offset within page
- * top entries directory page pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages. The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. We tell the Host the toplevel and
- * address this corresponds to. The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
- /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
- lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
- ptep->pte_low, ptep->pte_high);
-#else
- lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
- native_set_pud(pudp, pudval);
-
- /* 32 bytes aligned pdpt address and the index. */
- lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
- (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more. This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them. Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds! So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry. These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- native_set_pte_atomic(ptep, pte);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- native_pte_clear(mm, addr, ptep);
- lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
- lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations. On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present). Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
- /* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed. That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit. We don't actually "ack" interrupts as such, we
- * just mask and unmask them. I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
- set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
- clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
- .name = "lguest",
- .irq_mask = disable_lguest_irq,
- .irq_mask_ack = disable_lguest_irq,
- .irq_unmask = enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
- struct irq_desc *desc;
- int err;
-
- /* Returns -ve error or vector number. */
- err = irq_alloc_desc_at(irq, 0);
- if (err < 0 && err != -EEXIST)
- return err;
-
- /*
- * Tell the Linux infrastructure that the interrupt is
- * controlled by our level-based lguest interrupt controller.
- */
- irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
-
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- desc = irq_to_desc(irq);
- __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
- return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
- int err;
- u8 line = 0;
-
- /* We literally use the PCI interrupt line as the irq number. */
- pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
- err = lguest_setup_irq(line);
- if (!err)
- dev->irq = line;
- return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
- WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
- unsigned int i;
-
- for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
- if (i != IA32_SYSCALL_VECTOR)
- set_intr_gate(i, irq_entries_start +
- 8 * (i - FIRST_EXTERNAL_VECTOR));
- }
-
- /*
- * This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts.
- */
- irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
- *now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
- return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
- unsigned long sec, nsec;
-
- /*
- * Since the time is in two parts (seconds and nanoseconds), we risk
- * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
- * and getting 99 and 0. As Linux tends to come apart under the stress
- * of time travel, we must be careful:
- */
- do {
- /* First we read the seconds part. */
- sec = lguest_data.time.tv_sec;
- /*
- * This read memory barrier tells the compiler and the CPU that
- * this can't be reordered: we have to complete the above
- * before going on.
- */
- rmb();
- /* Now we read the nanoseconds part. */
- nsec = lguest_data.time.tv_nsec;
- /* Make sure we've done that. */
- rmb();
- /* Now if the seconds part has changed, try again. */
- } while (unlikely(lguest_data.time.tv_sec != sec));
-
- /* Our lguest clock is in real nanoseconds. */
- return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 200,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* FIXME: I don't think this can ever happen, but James tells me he had
- * to put this code in. Maybe we should remove it now. Anyone? */
- if (delta < LG_CLOCK_MIN_DELTA) {
- if (printk_ratelimit())
- printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __func__, delta);
- return -ETIME;
- }
-
- /* Please wake us this far in the future. */
- hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
- return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
- /* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
- return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
- .name = "lguest",
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = lguest_clockevent_set_next_event,
- .set_state_shutdown = lguest_clockevent_shutdown,
- .rating = INT_MAX,
- .mult = 1,
- .shift = 0,
- .min_delta_ns = LG_CLOCK_MIN_DELTA,
- .min_delta_ticks = LG_CLOCK_MIN_DELTA,
- .max_delta_ns = LG_CLOCK_MAX_DELTA,
- .max_delta_ticks = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
- unsigned long flags;
-
- /* Don't interrupt us while this is running. */
- local_irq_save(flags);
- lguest_clockevent.event_handler(&lguest_clockevent);
- local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure. The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
- /* Set up the timer interrupt (0) to go to our simple timer routine */
- if (lguest_setup_irq(0) != 0)
- panic("Could not set up timer irq");
- irq_set_handler(0, lguest_time_irq);
-
- clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
- /* We can't set cpumask in the initializer: damn C limitations! Set it
- * here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of(0);
- clockevents_register_device(&lguest_clockevent);
-
- /* Finally, we unblock the timer interrupt. */
- clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work. They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use. For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
- THREAD_SIZE / PAGE_SIZE);
- tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
- /* FIXME: Implement */
- return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
- /* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices). This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that. Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads. So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
- return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
- return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
- return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
- return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
- apic->read = lguest_apic_read;
- apic->write = lguest_apic_write;
- apic->icr_read = lguest_apic_icr_read;
- apic->icr_write = lguest_apic_icr_write;
- apic->wait_icr_idle = lguest_apic_wait_icr_idle;
- apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP! Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
- hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
- hcall(LHCALL_SHUTDOWN, __pa("Power down"),
- LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't. But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
- hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
- /* The hcall won't return, but to keep gcc happy, we're "done". */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
- .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
- /*
- * The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit.
- */
- e820__range_add(boot_params.e820_table[0].addr,
- boot_params.e820_table[0].size,
- boot_params.e820_table[0].type);
-
- /* This string is for the boot messages. */
- return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
- write_pci_config_byte(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, bar),
- 0);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, length),
- 4);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, offset),
- off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
- /*
- * We could set this up once, then leave it; nothing else in the *
- * kernel should touch these registers. But if it went wrong, that
- * would be a horrible bug to find.
- */
- set_cfg_window(cfg_offset, off);
- write_pci_config(0, 1, 0,
- cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
- u8 cap, common_cap = 0, device_cap = 0;
- u32 device_len;
-
- /* Avoid recursive printk into here. */
- console_cfg_offset = -1;
-
- if (!early_pci_allowed()) {
- printk(KERN_ERR "lguest: early PCI access not allowed!\n");
- return;
- }
-
- /* We expect a console PCI device at BUS0, slot 1. */
- if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
- printk(KERN_ERR "lguest: PCI device is %#x!\n",
- read_pci_config(0, 1, 0, 0));
- return;
- }
-
- /* Find the capabilities we need (must be in bar0) */
- cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
- while (cap) {
- u8 vndr = read_pci_config_byte(0, 1, 0, cap);
- if (vndr == PCI_CAP_ID_VNDR) {
- u8 type, bar;
-
- type = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, cfg_type));
- bar = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, bar));
-
- switch (type) {
- case VIRTIO_PCI_CAP_DEVICE_CFG:
- if (bar == 0)
- device_cap = cap;
- break;
- case VIRTIO_PCI_CAP_PCI_CFG:
- console_access_cap = cap;
- break;
- }
- }
- cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
- }
- if (!device_cap || !console_access_cap) {
- printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
- common_cap, device_cap, console_access_cap);
- return;
- }
-
- /*
- * Note that we can't check features, until we've set the DRIVER
- * status bit. We don't want to do that until we have a real driver,
- * so we just check that the device-specific config has room for
- * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
- * it should ignore the access.
- */
- device_len = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, length));
- if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
- + sizeof(u32))) {
- printk(KERN_ERR "lguest: console missing emerg_wr field\n");
- return;
- }
-
- console_cfg_offset = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, offset));
- printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
- /* If we couldn't find PCI console, forget it. */
- if (console_cfg_offset < 0)
- return count;
-
- if (unlikely(!console_cfg_offset)) {
- probe_pci_console();
- if (console_cfg_offset < 0)
- return count;
- }
-
- write_bar_via_cfg(console_access_cap,
- console_cfg_offset
- + offsetof(struct virtio_console_config, emerg_wr),
- buf[0]);
- return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
- hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel. This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"? The rest of that quote is
- * "... But that usually will create another problem." This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient. We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
- const char *start, *end;
-} lguest_insns[] = {
- [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
- unsigned long addr, unsigned len)
-{
- unsigned int insn_len;
-
- /* Don't do anything special if we don't have a replacement */
- if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
- /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
- if (len < insn_len)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- /* Copy in our instructions. */
- memcpy(ibuf, lguest_insns[type].start, insn_len);
- return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest. The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
- /* We're under lguest. */
- pv_info.name = "lguest";
- /* We're running at privilege level 1, not 0 as normal. */
- pv_info.kernel_rpl = 1;
- /* Everyone except Xen runs with this set. */
- pv_info.shared_kernel_pmd = 1;
-
- /*
- * We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves.
- */
-
- /* Interrupt-related operations */
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
- pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
- pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
- pv_irq_ops.safe_halt = lguest_safe_halt;
-
- /* Setup operations */
- pv_init_ops.patch = lguest_patch;
-
- /* Intercepts of various CPU instructions */
- pv_cpu_ops.load_gdt = lguest_load_gdt;
- pv_cpu_ops.cpuid = lguest_cpuid;
- pv_cpu_ops.load_idt = lguest_load_idt;
- pv_cpu_ops.iret = lguest_iret;
- pv_cpu_ops.load_sp0 = lguest_load_sp0;
- pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
- pv_cpu_ops.set_ldt = lguest_set_ldt;
- pv_cpu_ops.load_tls = lguest_load_tls;
- pv_cpu_ops.get_debugreg = lguest_get_debugreg;
- pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.read_cr0 = lguest_read_cr0;
- pv_cpu_ops.write_cr0 = lguest_write_cr0;
- pv_cpu_ops.read_cr4 = lguest_read_cr4;
- pv_cpu_ops.write_cr4 = lguest_write_cr4;
- pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
- pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
- pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
- pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
- /* Pagetable management */
- pv_mmu_ops.write_cr3 = lguest_write_cr3;
- pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
- pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
- pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- pv_mmu_ops.set_pte = lguest_set_pte;
- pv_mmu_ops.set_pte_at = lguest_set_pte_at;
- pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
- pv_mmu_ops.pte_clear = lguest_pte_clear;
- pv_mmu_ops.pmd_clear = lguest_pmd_clear;
- pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
- pv_mmu_ops.read_cr2 = lguest_read_cr2;
- pv_mmu_ops.read_cr3 = lguest_read_cr3;
- pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
- pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
- pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* APIC read/write intercepts */
- set_lguest_basic_apic_ops();
-#endif
-
- x86_init.resources.memory_setup = lguest_memory_setup;
- x86_init.irqs.intr_init = lguest_init_IRQ;
- x86_init.timers.timer_init = lguest_time_init;
- x86_platform.calibrate_tsc = lguest_tsc_khz;
- x86_platform.get_wallclock = lguest_get_wallclock;
-
- /*
- * Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init().
- */
-
- /*G:070
- * Now we've seen all the paravirt_ops, we return to
- * lguest_init() where the rest of the fairly chaotic boot setup
- * occurs.
- */
-
- /*
- * The stack protector is a weird thing where gcc places a canary
- * value on the stack and then checks it on return. This file is
- * compiled with -fno-stack-protector it, so we got this far without
- * problems. The value of the canary is kept at offset 20 from the
- * %gs register, so we need to set that up before calling C functions
- * in other files.
- */
- setup_stack_canary_segment(0);
-
- /*
- * We could just call load_stack_canary_segment(), but we might as well
- * call switch_to_new_gdt() which loads the whole table and sets up the
- * per-cpu segment descriptor register %fs as well.
- */
- switch_to_new_gdt(0);
-
- /*
- * The Host<->Guest Switcher lives at the top of our address space, and
- * the Host told us how big it is when we made LGUEST_INIT hypercall:
- * it put the answer in lguest_data.reserve_mem
- */
- reserve_top_address(lguest_data.reserve_mem);
-
- /* Hook in our special panic hypercall code. */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
- /*
- * This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do, too:
- */
- cpu_detect(&new_cpu_data);
- /* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
- /* Math is always hard! */
- set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
- /* We don't have features. We have puppies! Puppies! */
-#ifdef CONFIG_X86_MCE
- mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
-#endif
-
- /*
- * We set the preferred console to "hvc". This is the "hypervisor
- * virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use.
- */
- add_preferred_console("hvc", 0, NULL);
-
- /* Register our very early console. */
- virtio_cons_early_init(early_put_chars);
-
- /* Don't let ACPI try to control our PCI interrupts. */
- disable_acpi();
-
- /* We control them ourselves, by overriding these two hooks. */
- pcibios_enable_irq = lguest_enable_irq;
- pcibios_disable_irq = lguest_disable_irq;
-
- /*
- * Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off, and the reboot hook to our restart
- * routine.
- */
- pm_power_off = lguest_power_off;
- machine_ops.restart = lguest_restart;
-
- /*
- * Now we're set up, call i386_start_kernel() in head32.c and we proceed
- * to boot as normal. It never returns.
- */
- i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables. Finally it checks the 'hardware_subarch' field. This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here! We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
- /*
- * We make the "initialization" hypercall now to tell the Host where
- * our lguest_data struct is.
- */
- movl $LHCALL_LGUEST_INIT, %eax
- movl $lguest_data - __PAGE_OFFSET, %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
- movl $LHCALL_NEW_PGTABLE, %eax
- movl $(initial_page_table - __PAGE_OFFSET), %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Set up the initial stack so we can run C code. */
- movl $(init_thread_union+THREAD_SIZE),%esp
-
- /* Jumps are relative: we're running __PAGE_OFFSET too low. */
- jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...) \
- lgstart_##name: insns; lgend_##name:; \
- .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later). If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages. First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
- /*
- * The reverse of irq_disable, this sets lguest_data.irq_enabled to
- * X86_EFLAGS_IF (ie. "Interrupts enabled").
- */
- movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * But now we need to check if the Host wants to know: there might have
- * been interrupts waiting to be delivered, in which case it will have
- * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
- * jump to send_interrupts, otherwise we're done.
- */
- cmpl $0, lguest_data+LGUEST_DATA_irq_pending
- jnz send_interrupts
- /*
- * One cool thing about x86 is that you can do many things without using
- * a register. In this case, the normal path hasn't needed to save or
- * restore any registers at all!
- */
- ret
-send_interrupts:
- /*
- * OK, now we need a register: eax is used for the hypercall number,
- * which is LHCALL_SEND_INTERRUPTS.
- *
- * We used not to bother with this pending detection at all, which was
- * much simpler. Sooner or later the Host would realize it had to
- * send us an interrupt. But that turns out to make performance 7
- * times worse on a simple tcp benchmark. So now we do this the hard
- * way.
- */
- pushl %eax
- movl $LHCALL_SEND_INTERRUPTS, %eax
- /* This is the actual hypercall trap. */
- int $LGUEST_TRAP_ENTRY
- /* Put eax back the way we found it. */
- popl %eax
- ret
-
-/*
- * Finally, the "popf" or "restore flags" routine. The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
- /* This is just "lguest_data.irq_enabled = flags;" */
- movl %eax, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * Now, if the %eax value has enabled interrupts and
- * lguest_data.irq_pending is set, we want to tell the Host so it can
- * deliver any outstanding interrupts. Fortunately, both values will
- * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
- * instruction will AND them together for us. If both are set, we
- * jump to send_interrupts.
- */
- testl lguest_data+LGUEST_DATA_irq_pending, %eax
- jnz send_interrupts
- /* Again, the normal path has used no extra registers. Clever, huh? */
- ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it. However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway. If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last. It's *cool*! It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap. The
- * stack looks like this:
- * old address
- * old code segment & privilege level
- * old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once. The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic. The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
- pushl 2*4(%esp)
- /*
- * Note the %ss: segment prefix here. Normal data accesses use the
- * "ds" segment, but that will have already been restored for whatever
- * we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid.
- */
- popl %ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
- iret
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index f77ba3058b31..066996dba6a2 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -363,3 +363,4 @@ L_bugged_2:
pop %ebx
jmp L_exit
#endif /* PARANOID */
+ENDPROC(div_Xsig)
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index 47099628fa4c..2c71527bd917 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,4 +44,4 @@ ENTRY(FPU_div_small)
leave
ret
-
+ENDPROC(FPU_div_small)
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 0203baefb5c0..d4a7df2205b8 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info)
}
code_descriptor = FPU_get_ldt_descriptor(FPU_CS);
- if (SEG_D_SIZE(code_descriptor)) {
+ if (code_descriptor.d) {
/* The above test may be wrong, the book is not clear */
/* Segmented 32 bit protected mode */
addr_modes.default_mode = SEG32;
@@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info)
/* 16 bit protected mode */
addr_modes.default_mode = PM16;
}
- FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
- code_limit = code_base
- + (SEG_LIMIT(code_descriptor) +
- 1) * SEG_GRANULARITY(code_descriptor)
- - 1;
+ FPU_EIP += code_base = seg_get_base(&code_descriptor);
+ code_limit = seg_get_limit(&code_descriptor) + 1;
+ code_limit *= seg_get_granularity(&code_descriptor);
+ code_limit += code_base - 1;
if (code_limit < code_base)
code_limit = 0xffffffff;
}
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index a179254a5122..699f329f1d40 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
return ret;
}
-#define SEG_D_SIZE(x) ((x).b & (3 << 21))
-#define SEG_G_BIT(x) ((x).b & (1 << 23))
-#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1)
-#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23)))
-#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \
- | (((s).b & 0xff) << 16) | ((s).a >> 16))
-#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff))
-#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11))
-#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9))
-#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
- == (1 << 10))
+#define SEG_TYPE_WRITABLE (1U << 1)
+#define SEG_TYPE_EXPANDS_DOWN (1U << 2)
+#define SEG_TYPE_EXECUTE (1U << 3)
+#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE)
+#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE)
+
+static inline unsigned long seg_get_base(struct desc_struct *d)
+{
+ unsigned long base = (unsigned long)d->base2 << 24;
+
+ return base | ((unsigned long)d->base1 << 16) | d->base0;
+}
+
+static inline unsigned long seg_get_limit(struct desc_struct *d)
+{
+ return ((unsigned long)d->limit1 << 16) | d->limit0;
+}
+
+static inline unsigned long seg_get_granularity(struct desc_struct *d)
+{
+ return d->g ? 4096 : 1;
+}
+
+static inline bool seg_expands_down(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN;
+}
+
+static inline bool seg_execute_only(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE;
+}
+
+static inline bool seg_writable(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
+}
#define I387 (&current->thread.fpu.state)
#define FPU_info (I387->soft.info)
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index b8ef9f9d2ffc..c48967c6a0e2 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment,
}
descriptor = FPU_get_ldt_descriptor(addr->selector);
- base_address = SEG_BASE_ADDR(descriptor);
+ base_address = seg_get_base(&descriptor);
address = base_address + offset;
- limit = base_address
- + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1;
+ limit = seg_get_limit(&descriptor) + 1;
+ limit *= seg_get_granularity(&descriptor);
+ limit += base_address - 1;
if (limit < base_address)
limit = 0xffffffff;
- if (SEG_EXPAND_DOWN(descriptor)) {
- if (SEG_G_BIT(descriptor))
+ if (seg_expands_down(&descriptor)) {
+ if (descriptor.g) {
seg_top = 0xffffffff;
- else {
+ } else {
seg_top = base_address + (1 << 20);
if (seg_top < base_address)
seg_top = 0xffffffff;
@@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment,
(address > limit) || (address < base_address) ? 0 :
((limit - address) >= 254 ? 255 : limit - address + 1);
}
- if (SEG_EXECUTE_ONLY(descriptor) ||
- (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
+ if (seg_execute_only(&descriptor) ||
+ (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
access_limit = 0;
}
return address;
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 717785a53eb4..22e0631bb85a 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,6 +62,7 @@ ENTRY(mul32_Xsig)
popl %esi
leave
ret
+ENDPROC(mul32_Xsig)
ENTRY(mul64_Xsig)
@@ -114,6 +115,7 @@ ENTRY(mul64_Xsig)
popl %esi
leave
ret
+ENDPROC(mul64_Xsig)
@@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig)
popl %esi
leave
ret
-
+ENDPROC(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 17315c89ff3d..a9aaf414135d 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,3 +133,4 @@ L_accum_done:
popl %esi
leave
ret
+ENDPROC(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index 8b6352efceef..53ac1a343c69 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -94,6 +94,7 @@ L_overflow:
call arith_overflow
pop %ebx
jmp L_exit
+ENDPROC(FPU_normalize)
@@ -145,3 +146,4 @@ L_exit_nuo_zero:
popl %ebx
leave
ret
+ENDPROC(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index d1d4e48b4f67..41af5b208d88 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -706,3 +706,5 @@ L_exception_exit:
mov $-1,%eax
jmp fpu_reg_round_special_exit
#endif /* PARANOID */
+
+ENDPROC(FPU_round)
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 47c4c2434d85..3b1bc5e9b2f6 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -165,3 +165,4 @@ L_exit:
leave
ret
#endif /* PARANOID */
+ENDPROC(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index cc00654b6f9a..796eb5ab921b 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -469,3 +469,5 @@ L_exit:
leave
ret
#endif /* PARANOID */
+
+ENDPROC(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 973f12af97df..6196f68cf3c1 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -146,3 +146,4 @@ L_exit:
ret
#endif /* PARANOID */
+ENDPROC(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index 1b6c24801d22..d115b900919a 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,3 +270,4 @@ L_exit:
popl %esi
leave
ret
+ENDPROC(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index bbe0e87718e4..87c99749a495 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
popl %ebx
leave
ret
-
+ENDPROC(round_Xsig)
@@ -138,4 +138,4 @@ L_n_exit:
popl %ebx
leave
ret
-
+ENDPROC(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 31cdd118e918..c8552edeec75 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -85,3 +85,4 @@ L_more_than_95:
popl %esi
leave
ret
+ENDPROC(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 518428317985..340dd6897f85 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -92,6 +92,7 @@ L_more_than_95:
popl %esi
leave
ret
+ENDPROC(FPU_shrx)
/*---------------------------------------------------------------------------+
@@ -202,3 +203,4 @@ Ls_more_than_95:
popl %esi
leave
ret
+ENDPROC(FPU_shrxs)
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
index d258f59564e1..695afae38fdf 100644
--- a/arch/x86/math-emu/wm_sqrt.S
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -468,3 +468,4 @@ sqrt_more_prec_large:
/* Our estimate is too large */
movl $0x7fffff00,%eax
jmp sqrt_round_result
+ENDPROC(wm_sqrt)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..c076f710de4c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fault);
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+bool ex_handler_refcount(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ /* First unconditionally saturate the refcount. */
+ *(int *)regs->cx = INT_MIN / 2;
+
+ /*
+ * Strictly speaking, this reports the fixup destination, not
+ * the fault location, and not the actually overflowing
+ * instruction, which is the instruction before the "js", but
+ * since that instruction could be a variety of lengths, just
+ * report the location after the overflow, which should be close
+ * enough for finding the overflow, as it's at least back in
+ * the function, having returned from .text.unlikely.
+ */
+ regs->ip = ex_fixup_addr(fixup);
+
+ /*
+ * This function has been called because either a negative refcount
+ * value was seen by any of the refcount functions, or a zero
+ * refcount value was seen by refcount_dec().
+ *
+ * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+ * wrapped around) will be set. Additionally, seeing the refcount
+ * reach 0 will set ZF (Zero Flag: result was zero). In each of
+ * these cases we want a report, since it's a boundary condition.
+ *
+ */
+ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+ bool zero = regs->flags & X86_EFLAGS_ZF;
+
+ refcount_error_report(regs, zero ? "hit zero" : "overflow");
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_refcount);
+
bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
@@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
* undefined. I'm not sure which CPUs do this, but at least
* the 486 DX works this way.
*/
- if ((regs->cs & 0xFFFF) != __KERNEL_CS)
+ if (regs->cs != __KERNEL_CS)
goto fail;
/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0cdf14cf3270..b836a7274e12 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1258,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
- *
- * This function must have noinline because both callers
- * {,trace_}do_page_fault() have notrace on. Having this an actual function
- * guarantees there's a function trace entry.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
@@ -1494,27 +1490,6 @@ good_area:
}
NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
-{
- unsigned long address = read_cr2(); /* Get the faulting address */
- enum ctx_state prev_state;
-
- /*
- * We must have this function tagged with __kprobes, notrace and call
- * read_cr2() before calling anything else. To avoid calling any kind
- * of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contain all sorts of tracepoints.
- */
-
- prev_state = exception_enter();
- __do_page_fault(regs, error_code, address);
- exception_exit(prev_state);
-}
-NOKPROBE_SYMBOL(do_page_fault);
-
-#ifdef CONFIG_TRACING
static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
@@ -1525,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
trace_page_fault_kernel(address, regs, error_code);
}
+/*
+ * We must have this function blacklisted from kprobes, tagged with notrace
+ * and call read_cr2() before calling anything else. To avoid calling any
+ * kind of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contains all sorts of tracepoints.
+ */
dotraplinkage void notrace
-trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
- /*
- * The exception_enter and tracepoint processing could
- * trigger another page faults (user space callchain
- * reading) and destroy the original cr2 value, so read
- * the faulting address now.
- */
- unsigned long address = read_cr2();
+ unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
prev_state = exception_enter();
- trace_page_fault_entries(address, regs, error_code);
+ if (trace_pagefault_enabled())
+ trace_page_fault_entries(address, regs, error_code);
+
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
-NOKPROBE_SYMBOL(trace_do_page_fault);
-#endif /* CONFIG_TRACING */
+NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce3dedf..d805162e6045 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr. The return value is the number of nodes allocated.
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, int nr_nodes)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 size;
int big;
int nid = 0;
@@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
return -1;
}
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
@@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'. The return value is the number of nodes allocated.
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, u64 size)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 min_size;
int nid = 0;
int i, ret;
@@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
}
size &= FAKE_NODE_MIN_HASH_MASK;
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
@@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return 0;
}
+int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+ int i, max_emu_nid = 0;
+
+ *dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (*dfl_phys_nid == NUMA_NO_NODE)
+ *dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+
+ return max_emu_nid;
+}
+
/**
* numa_emulation - Emulate NUMA nodes
* @numa_meminfo: NUMA configuration to massage
@@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* Determine the max emulated nid and the default phys nid to use
* for unmapped nodes.
*/
- max_emu_nid = 0;
- dfl_phys_nid = NUMA_NO_NODE;
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
- if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
- max_emu_nid = i;
- if (dfl_phys_nid == NUMA_NO_NODE)
- dfl_phys_nid = emu_nid_to_phys[i];
- }
- }
- if (dfl_phys_nid == NUMA_NO_NODE) {
- pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
- goto no_emu;
- }
+ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
/* commit */
*numa_meminfo = ei;
+ /* Make sure numa_nodes_parsed only contains emulated nodes */
+ nodes_clear(numa_nodes_parsed);
+ for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+ if (ei.blk[i].start != ei.blk[i].end &&
+ ei.blk[i].nid != NUMA_NO_NODE)
+ node_set(ei.blk[i].nid, numa_nodes_parsed);
+
/*
* Transform __apicid_to_node table to use emulated nids by
* reverse-mapping phys_nid. The maps should always exist but fall
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ce104b962a17..dbbcfd59726a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -214,6 +214,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/*
+ * Call this when reinitializing a CPU. It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ * because the CPU was taken down and came back up with CR3's PCID
+ * bits clear. CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ *
+ * - The CPU went so far out to lunch that it may have missed a TLB
+ * flush.
+ */
+void initialize_tlbstate_and_flush(void)
+{
+ int i;
+ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long cr3 = __read_cr3();
+
+ /* Assert that CR3 already references the right mm. */
+ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+
+ /*
+ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
+ * doesn't work like other CR4 bits because it can only be set from
+ * long mode.)
+ */
+ WARN_ON(boot_cpu_has(X86_CR4_PCIDE) &&
+ !(cr4_read_shadow() & X86_CR4_PCIDE));
+
+ /* Force ASID 0 and force a TLB flush. */
+ write_cr3(cr3 & ~CR3_PCID_MASK);
+
+ /* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+
+ for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
+ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
+}
+
+/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e1324f280e06..8c9573660d51 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -94,7 +94,9 @@ static int bpf_size_to_x86_bytes(int bpf_size)
#define X86_JNE 0x75
#define X86_JBE 0x76
#define X86_JA 0x77
+#define X86_JL 0x7C
#define X86_JGE 0x7D
+#define X86_JLE 0x7E
#define X86_JG 0x7F
static void bpf_flush_icache(void *start, void *end)
@@ -285,7 +287,7 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */
offsetof(struct bpf_array, map.max_entries));
EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */
-#define OFFSET1 47 /* number of bytes to jump */
+#define OFFSET1 43 /* number of bytes to jump */
EMIT2(X86_JBE, OFFSET1); /* jbe out */
label1 = cnt;
@@ -294,21 +296,20 @@ static void emit_bpf_tail_call(u8 **pprog)
*/
EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
-#define OFFSET2 36
+#define OFFSET2 32
EMIT2(X86_JA, OFFSET2); /* ja out */
label2 = cnt;
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */
/* prog = array->ptrs[index]; */
- EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
+ EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
offsetof(struct bpf_array, ptrs));
- EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
/* if (prog == NULL)
* goto out;
*/
- EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
#define OFFSET3 10
EMIT2(X86_JE, OFFSET3); /* je out */
label3 = cnt;
@@ -888,9 +889,13 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_JEQ | BPF_X:
case BPF_JMP | BPF_JNE | BPF_X:
case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JLT | BPF_X:
case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JLE | BPF_X:
case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSLT | BPF_X:
case BPF_JMP | BPF_JSGE | BPF_X:
+ case BPF_JMP | BPF_JSLE | BPF_X:
/* cmp dst_reg, src_reg */
EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
add_2reg(0xC0, dst_reg, src_reg));
@@ -911,9 +916,13 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
/* cmp dst_reg, imm8/32 */
EMIT1(add_1mod(0x48, dst_reg));
@@ -935,18 +944,34 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
/* GT is unsigned '>', JA in x86 */
jmp_cond = X86_JA;
break;
+ case BPF_JLT:
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = X86_JB;
+ break;
case BPF_JGE:
/* GE is unsigned '>=', JAE in x86 */
jmp_cond = X86_JAE;
break;
+ case BPF_JLE:
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = X86_JBE;
+ break;
case BPF_JSGT:
/* signed '>', GT in x86 */
jmp_cond = X86_JG;
break;
+ case BPF_JSLT:
+ /* signed '<', LT in x86 */
+ jmp_cond = X86_JL;
+ break;
case BPF_JSGE:
/* signed '>=', GE in x86 */
jmp_cond = X86_JGE;
break;
+ case BPF_JSLE:
+ /* signed '<=', LE in x86 */
+ jmp_cond = X86_JLE;
+ break;
default: /* to silence gcc warning */
return -EFAULT;
}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 78459a6d455a..4d68d59f457d 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -181,6 +181,7 @@ static void fix_processor_context(void)
#endif
load_TR_desc(); /* This does ltr */
load_mm_ldt(current->active_mm); /* This does lldt */
+ initialize_tlbstate_and_flush();
fpu__resume_cpu();
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ae4cd58c0c7a..02250b2633b8 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -50,7 +50,7 @@ void foo(void)
DEFINE(HOST_GS, GS);
DEFINE(HOST_ORIG_AX, ORIG_EAX);
#else
-#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET)
+#ifdef FP_XSTATE_MAGIC1
DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
#else
DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index df1921751aa5..ae2a2e2d6362 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
- return d1->a == d2->a && d1->b == d2->b;
+ return !memcmp(d1, d2, sizeof(*d1));
}
static void load_TLS_descriptor(struct thread_struct *t,
@@ -586,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
preempt_enable();
}
+#ifdef CONFIG_X86_64
+struct trap_array_entry {
+ void (*orig)(void);
+ void (*xen)(void);
+ bool ist_okay;
+};
+
+static struct trap_array_entry trap_array[] = {
+ { debug, xen_xendebug, true },
+ { int3, xen_xenint3, true },
+ { double_fault, xen_double_fault, true },
+#ifdef CONFIG_X86_MCE
+ { machine_check, xen_machine_check, true },
+#endif
+ { nmi, xen_nmi, true },
+ { overflow, xen_overflow, false },
+#ifdef CONFIG_IA32_EMULATION
+ { entry_INT80_compat, xen_entry_INT80_compat, false },
+#endif
+ { page_fault, xen_page_fault, false },
+ { divide_error, xen_divide_error, false },
+ { bounds, xen_bounds, false },
+ { invalid_op, xen_invalid_op, false },
+ { device_not_available, xen_device_not_available, false },
+ { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
+ { invalid_TSS, xen_invalid_TSS, false },
+ { segment_not_present, xen_segment_not_present, false },
+ { stack_segment, xen_stack_segment, false },
+ { general_protection, xen_general_protection, false },
+ { spurious_interrupt_bug, xen_spurious_interrupt_bug, false },
+ { coprocessor_error, xen_coprocessor_error, false },
+ { alignment_check, xen_alignment_check, false },
+ { simd_coprocessor_error, xen_simd_coprocessor_error, false },
+};
+
+static bool get_trap_addr(void **addr, unsigned int ist)
+{
+ unsigned int nr;
+ bool ist_okay = false;
+
+ /*
+ * Replace trap handler addresses by Xen specific ones.
+ * Check for known traps using IST and whitelist them.
+ * The debugger ones are the only ones we care about.
+ * Xen will handle faults like double_fault, * so we should never see
+ * them. Warn if there's an unexpected IST-using fault handler.
+ */
+ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
+ struct trap_array_entry *entry = trap_array + nr;
+
+ if (*addr == entry->orig) {
+ *addr = entry->xen;
+ ist_okay = entry->ist_okay;
+ break;
+ }
+ }
+
+ if (WARN_ON(ist != 0 && !ist_okay))
+ return false;
+
+ return true;
+}
+#endif
+
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
unsigned long addr;
- if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
+ if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- addr = gate_offset(*val);
+ addr = gate_offset(val);
#ifdef CONFIG_X86_64
- /*
- * Look for known traps using IST, and substitute them
- * appropriately. The debugger ones are the only ones we care
- * about. Xen will handle faults like double_fault,
- * so we should never see them. Warn if
- * there's an unexpected IST-using fault handler.
- */
- if (addr == (unsigned long)debug)
- addr = (unsigned long)xen_debug;
- else if (addr == (unsigned long)int3)
- addr = (unsigned long)xen_int3;
- else if (addr == (unsigned long)stack_segment)
- addr = (unsigned long)xen_stack_segment;
- else if (addr == (unsigned long)double_fault) {
- /* Don't need to handle these */
+ if (!get_trap_addr((void **)&addr, val->bits.ist))
return 0;
-#ifdef CONFIG_X86_MCE
- } else if (addr == (unsigned long)machine_check) {
- /*
- * when xen hypervisor inject vMCE to guest,
- * use native mce handler to handle it
- */
- ;
-#endif
- } else if (addr == (unsigned long)nmi)
- /*
- * Use the native version as well.
- */
- ;
- else {
- /* Some other trap using IST? */
- if (WARN_ON(val->ist != 0))
- return 0;
- }
#endif /* CONFIG_X86_64 */
info->address = addr;
- info->cs = gate_segment(*val);
- info->flags = val->dpl;
+ info->cs = gate_segment(val);
+ info->flags = val->bits.dpl;
/* interrupt gates clear IF */
- if (val->type == GATE_INTERRUPT)
+ if (val->bits.type == GATE_INTERRUPT)
info->flags |= 1 << 2;
return 1;
@@ -988,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void)
}
}
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
-{
- char *start, *end, *reloc;
- unsigned ret;
-
- start = end = reloc = NULL;
-
-#define SITE(op, x) \
- case PARAVIRT_PATCH(op.x): \
- if (xen_have_vcpu_info_placement) { \
- start = (char *)xen_##x##_direct; \
- end = xen_##x##_direct_end; \
- reloc = xen_##x##_direct_reloc; \
- } \
- goto patch_site
-
- switch (type) {
- SITE(pv_irq_ops, irq_enable);
- SITE(pv_irq_ops, irq_disable);
- SITE(pv_irq_ops, save_fl);
- SITE(pv_irq_ops, restore_fl);
-#undef SITE
-
- patch_site:
- if (start == NULL || (end-start) > len)
- goto default_patch;
-
- ret = paravirt_patch_insns(insnbuf, len, start, end);
-
- /* Note: because reloc is assigned from something that
- appears to be an array, gcc assumes it's non-null,
- but doesn't know its relationship with start and
- end. */
- if (reloc > start && reloc < end) {
- int reloc_off = reloc - start;
- long *relocp = (long *)(insnbuf + reloc_off);
- long delta = start - (char *)addr;
-
- *relocp += delta;
- }
- break;
-
- default_patch:
- default:
- ret = paravirt_patch_default(type, clobbers, insnbuf,
- addr, len);
- break;
- }
-
- return ret;
-}
-
static const struct pv_info xen_info __initconst = {
.shared_kernel_pmd = 0,
@@ -1050,10 +1029,6 @@ static const struct pv_info xen_info __initconst = {
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initconst = {
- .patch = xen_patch,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
@@ -1251,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_init_ops = xen_init_ops;
+ pv_init_ops.patch = paravirt_patch_default;
pv_cpu_ops = xen_cpu_ops;
x86_platform.get_nmi_reason = xen_get_nmi_reason;
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 33e92955e09d..d4eff5676cfa 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
.safe_halt = xen_safe_halt,
.halt = xen_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
};
void __init xen_init_irq_ops(void)
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index eff224df813f..dcd31fa39b5d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/asm-offsets.h>
@@ -16,7 +10,7 @@
#include <asm/processor-flags.h>
#include <asm/frame.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
/*
* Enable events. This clears the event mask and tests the pending
@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_irq_enable_direct)
FRAME_END
ret
ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
/*
@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
+ENDPROC(xen_irq_disable_direct)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
-ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
/*
@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_restore_fl_direct)
FRAME_END
ret
ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
/*
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
deleted file mode 100644
index 465276467a47..000000000000
--- a/arch/x86/xen/xen-asm.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _XEN_XEN_ASM_H
-#define _XEN_XEN_ASM_H
-
-#include <linux/linkage.h>
-
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index feb6d40a0860..1200e262a116 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/thread_info.h>
@@ -18,21 +12,10 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
/*
* This is run where a normal iret would be run, with the same stack setup:
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..dae2cc33afb5 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/errno.h>
@@ -20,13 +14,44 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
+
+.macro xen_pv_trap name
+ENTRY(xen_\name)
+ pop %rcx
+ pop %r11
+ jmp \name
+END(xen_\name)
+.endm
-ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp), %rcx
- mov 8+8(%rsp), %r11
- ret $16
-ENDPROC(xen_adjust_exception_frame)
+xen_pv_trap divide_error
+xen_pv_trap debug
+xen_pv_trap xendebug
+xen_pv_trap int3
+xen_pv_trap xenint3
+xen_pv_trap nmi
+xen_pv_trap overflow
+xen_pv_trap bounds
+xen_pv_trap invalid_op
+xen_pv_trap device_not_available
+xen_pv_trap double_fault
+xen_pv_trap coprocessor_segment_overrun
+xen_pv_trap invalid_TSS
+xen_pv_trap segment_not_present
+xen_pv_trap stack_segment
+xen_pv_trap general_protection
+xen_pv_trap page_fault
+xen_pv_trap spurious_interrupt_bug
+xen_pv_trap coprocessor_error
+xen_pv_trap alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+xen_pv_trap hypervisor_callback
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
@@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
*/
ENTRY(xen_iret)
pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_iret)
-RELOC(xen_iret, 1b+1)
+ jmp hypercall_iret
ENTRY(xen_sysret64)
/*
@@ -65,9 +88,7 @@ ENTRY(xen_sysret64)
pushq %rcx
pushq $VGCF_in_syscall
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret64)
-RELOC(xen_sysret64, 1b+1)
+ jmp hypercall_iret
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
@@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1)
* rip
* r11
* rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
-.macro undo_xen_syscall
- mov 0*8(%rsp), %rcx
- mov 1*8(%rsp), %r11
- mov 5*8(%rsp), %rsp
-.endm
-
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
- undo_xen_syscall
- jmp entry_SYSCALL_64_after_swapgs
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
- undo_xen_syscall
- jmp entry_SYSCALL_compat
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
- undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..c8a6d224f7ed 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void)
}
#endif
-/* Declare an asm function, along with symbols needed to make it
- inlineable */
-#define DECL_ASM(ret, name, ...) \
- __visible ret name(__VA_ARGS__); \
- extern char name##_end[] __visible; \
- extern char name##_reloc[] __visible
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
-__visible void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);